From 991de6cbe4ac778fe2543fa9004e44438e40800a Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Thu, 3 Jul 2025 14:49:56 +0400 Subject: [PATCH 01/98] v1 --- src/llama-arch.cpp | 59 +++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 58 insertions(+), 1 deletion(-) diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index ab24054305857..635a5bdbc9b65 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -46,6 +46,7 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_STARCODER2, "starcoder2" }, { LLM_ARCH_MAMBA, "mamba" }, { LLM_ARCH_MAMBA2, "mamba2" }, + { LLM_ARCH_FALCON_H1, "falcon-h1" }, { LLM_ARCH_XVERSE, "xverse" }, { LLM_ARCH_COMMAND_R, "command-r" }, { LLM_ARCH_COHERE2, "cohere2" }, @@ -118,7 +119,7 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_LOGIT_SCALE, "%s.logit_scale" }, { LLM_KV_DECODER_START_TOKEN_ID, "%s.decoder_start_token_id" }, { LLM_KV_ATTN_LOGIT_SOFTCAPPING, "%s.attn_logit_softcapping" }, - { LLM_KV_FINAL_LOGIT_SOFTCAPPING, "%s.final_logit_softcapping" }, + { LLM_KV_FINAL_LOGIT_SOFTCAPPING, "%s.final_logit_softcapping" }, { LLM_KV_SWIN_NORM, "%s.swin_norm" }, { LLM_KV_RESCALE_EVERY_N_LAYERS, "%s.rescale_every_n_layers" }, { LLM_KV_TIME_MIX_EXTRA_DIM, "%s.time_mix_extra_dim" }, @@ -127,6 +128,7 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_EMBEDDING_SCALE, "%s.embedding_scale" }, { LLM_KV_TOKEN_SHIFT_COUNT, "%s.token_shift_count" }, { LLM_KV_INTERLEAVE_MOE_LAYER_STEP, "%s.interleave_moe_layer_step" }, + { LLM_KV_ATTN_HEAD_DIM, "%s.attention.head_dim" }, { LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" }, { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" }, @@ -216,6 +218,31 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_TOKENIZER_FIM_REP_ID, "tokenizer.ggml.fim_rep_token_id" }, { LLM_KV_TOKENIZER_FIM_SEP_ID, "tokenizer.ggml.fim_sep_token_id" }, + { LLM_KV_SSM_CONV_KERNEL, "%s.ssm.conv_kernel" }, + { LLM_KV_SSM_INNER_SIZE, "%s.ssm.inner_size" }, + { LLM_KV_SSM_STATE_SIZE, "%s.ssm.state_size" }, + { LLM_KV_SSM_TIME_STEP_RANK, "%s.ssm.time_step_rank" }, + { LLM_KV_SSM_DT_B_C_RMS, "%s.ssm.dt_b_c_rms" }, + { LLM_KV_SSM_GROUP_COUNT, "%s.ssm.group_count" }, + { LLM_KV_SSM_HEAD_DIM, "%s.ssm.head_dim" }, + { LLM_KV_MAMBA_D_SSM, "%s.ssm.mamba_d_ssm" }, + + { LLM_KV_FALCON_H1_USE_MLP, "%s.mamba_use_mlp" }, + { LLM_KV_FALCON_H1_ATTENTION_IN_MULTIPLIER, "%s.attention_in_multiplier" }, + { LLM_KV_FALCON_H1_ATTENTION_OUT_MULTIPLIER, "%s.attention_out_multiplier" }, + { LLM_KV_FALCON_H1_SSM_IN_MULTIPLIER, "%s.ssm_in_multiplier" }, + { LLM_KV_FALCON_H1_SSM_OUT_MULTIPLIER, "%s.ssm_out_multiplier" }, + { LLM_KV_FALCON_H1_MLP_GATE_MULTIPLIER, "%s.mlp_gate_multiplier" }, + { LLM_KV_FALCON_H1_MLP_DOWN_MULTIPLIER, "%s.mlp_down_multiplier" }, + { LLM_KV_FALCON_H1_SSM_HAS_MUP, "%s.ssm.has_mup" }, + { LLM_KV_FALCON_H1_MAMBA_NORM_BEFORE_GATE, "%s.mamba_norm_before_gate" }, + { LLM_KV_FALCON_H1_MAMBA_RMS_NORM, "%s.mamba_rms_norm" }, + { LLM_KV_FALCON_H1_ROPE_THETA, "%s.rope_theta" }, + { LLM_KV_FALCON_H1_KEY_MULTIPLIER, "%s.key_multiplier" }, + { LLM_KV_FALCON_H1_LM_HEAD_MULTIPLIER, "%s.lm_head_multiplier" }, + { LLM_KV_FALCON_H1_EMBEDDING_MULTIPLIER, "%s.embedding_multiplier" }, + { LLM_KV_FALCON_H1_MAMBA_CHUNK_SIZE, "%s.ssm.mamba_chunk_size" }, + { LLM_KV_ADAPTER_TYPE, "adapter.type" }, { LLM_KV_ADAPTER_LORA_ALPHA, "adapter.lora.alpha" }, @@ -1022,6 +1049,31 @@ static const std::map> LLM_TENSOR_N { LLM_TENSOR_SSM_OUT, "blk.%d.ssm_out" }, }, }, + { + LLM_ARCH_FALCON_H1, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT, 
"output" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, + { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, + { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_SSM_IN, "blk.%d.ssm_in" }, + { LLM_TENSOR_SSM_CONV1D, "blk.%d.ssm_conv1d" }, + { LLM_TENSOR_SSM_DT, "blk.%d.ssm_dt" }, + { LLM_TENSOR_SSM_MUP_VEC, "blk.%d.ssm_mup_vec" }, + { LLM_TENSOR_SSM_A, "blk.%d.ssm_a" }, + { LLM_TENSOR_SSM_D, "blk.%d.ssm_d" }, + { LLM_TENSOR_SSM_NORM, "blk.%d.ssm_norm" }, + { LLM_TENSOR_SSM_OUT, "blk.%d.ssm_out" }, + { LLM_TENSOR_FFN_PRE_NORM, "blk.%d.ffn_pre_norm" }, + { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + }, + }, { LLM_ARCH_XVERSE, { @@ -1711,6 +1763,7 @@ static const std::map LLM_TENSOR_INFOS = { {LLM_TENSOR_CLS, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}}, {LLM_TENSOR_CLS_OUT, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}}, {LLM_TENSOR_OUTPUT_NORM, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}}, + {LLM_TENSOR_FINAL_NORM, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}}, {LLM_TENSOR_DEC_OUTPUT_NORM, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}}, {LLM_TENSOR_ENC_OUTPUT_NORM, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}}, {LLM_TENSOR_ROPE_FREQS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ROPE}}, @@ -1780,6 +1833,7 @@ static const std::map LLM_TENSOR_INFOS = { {LLM_TENSOR_SSM_A, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_SSM_SCAN}}, {LLM_TENSOR_SSM_D, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_SSM_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, + {LLM_TENSOR_SSM_MUP_VEC, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_TIME_MIX_LERP_X, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_TIME_MIX_LN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_CHANNEL_MIX_LERP_K, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, @@ -1812,6 +1866,7 @@ static const std::map LLM_TENSOR_INFOS = { {LLM_TENSOR_ATTN_KV_A_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_ATTN_SUB_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_FFN_SUB_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, + {LLM_TENSOR_FFN_PRE_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_DEC_ATTN_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_DEC_CROSS_ATTN_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_DEC_FFN_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, @@ -1928,6 +1983,8 @@ bool llm_arch_is_hybrid(const llm_arch & arch) { // TODO: There are currently no hybrid models! 
Once there are, this will be // the place to identify them switch (arch) { + case LLM_ARCH_FALCON_H1: + return true; default: return false; } From f897efdaf637d021e1e591a0aa1595e9339f02e5 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Thu, 3 Jul 2025 15:05:01 +0400 Subject: [PATCH 02/98] push more fixes --- src/llama-arch.h | 25 +++ src/llama-context.cpp | 16 +- src/llama-graph.cpp | 7 + src/llama-hparams.cpp | 11 +- src/llama-hparams.h | 25 +++ src/llama-memory-hybrid.cpp | 2 +- src/llama-model.cpp | 422 ++++++++++++++++++++++++++++++++++++ src/llama-model.h | 7 + src/llama-vocab.cpp | 1 + 9 files changed, 505 insertions(+), 11 deletions(-) diff --git a/src/llama-arch.h b/src/llama-arch.h index b769831dff5ec..c05cb851978bd 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -50,6 +50,7 @@ enum llm_arch { LLM_ARCH_STARCODER2, LLM_ARCH_MAMBA, LLM_ARCH_MAMBA2, + LLM_ARCH_FALCON_H1, LLM_ARCH_XVERSE, LLM_ARCH_COMMAND_R, LLM_ARCH_COHERE2, @@ -156,6 +157,27 @@ enum llm_kv { LLM_KV_ATTENTION_VALUE_LENGTH_MLA, LLM_KV_ATTENTION_LAYER_INDICES, + // Falcon-H1 specific + LLM_KV_ATTN_HEAD_DIM, + LLM_KV_SSM_HEAD_DIM, + LLM_KV_MAMBA_D_SSM, + LLM_KV_N_LAYER, + LLM_KV_FALCON_H1_USE_MLP, + LLM_KV_FALCON_H1_ATTENTION_IN_MULTIPLIER, + LLM_KV_FALCON_H1_ATTENTION_OUT_MULTIPLIER, + LLM_KV_FALCON_H1_SSM_IN_MULTIPLIER, + LLM_KV_FALCON_H1_SSM_OUT_MULTIPLIER, + LLM_KV_FALCON_H1_MLP_GATE_MULTIPLIER, + LLM_KV_FALCON_H1_MLP_DOWN_MULTIPLIER, + LLM_KV_FALCON_H1_SSM_HAS_MUP, + LLM_KV_FALCON_H1_MAMBA_NORM_BEFORE_GATE, + LLM_KV_FALCON_H1_MAMBA_RMS_NORM, + LLM_KV_FALCON_H1_ROPE_THETA, + LLM_KV_FALCON_H1_KEY_MULTIPLIER, + LLM_KV_FALCON_H1_LM_HEAD_MULTIPLIER, + LLM_KV_FALCON_H1_EMBEDDING_MULTIPLIER, + LLM_KV_FALCON_H1_MAMBA_CHUNK_SIZE, + LLM_KV_ROPE_DIMENSION_COUNT, LLM_KV_ROPE_DIMENSION_SECTIONS, LLM_KV_ROPE_FREQ_BASE, @@ -389,6 +411,9 @@ enum llm_tensor { LLM_TENSOR_POS_NET_ATTN_K, LLM_TENSOR_POS_NET_ATTN_V, LLM_TENSOR_POS_NET_ATTN_OUT, + LLM_TENSOR_SSM_MUP_VEC, + LLM_TENSOR_FFN_PRE_NORM, + LLM_TENSOR_FINAL_NORM, }; enum llm_tensor_layer { diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 06e93b19cbf40..e319beaa98ee2 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -2220,14 +2220,14 @@ llama_context * llama_init_from_model( return nullptr; } - try { - auto * ctx = new llama_context(*model, params); - return ctx; - } catch (const std::exception & err) { - LLAMA_LOG_ERROR("%s: failed to initialize the context: %s\n", __func__, err.what()); - } - - return nullptr; + // try { + auto * ctx = new llama_context(*model, params); + return ctx; + // } catch (const std::exception & err) { + // LLAMA_LOG_ERROR("%s: failed to initialize the context: %s\n", __func__, err.what()); + // } + + // return nullptr; } // deprecated diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index 4443420132f2b..eea8207c143cf 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -545,6 +545,10 @@ ggml_tensor * llm_graph_context::build_ffn( case LLM_FFN_PAR: { cur = build_lora_mm(gate, cur); + if (arch == LLM_ARCH_FALCON_H1) { + cur = ggml_scale(ctx0, cur, hparams.mlp_gate_multiplier); + } + cb(cur, "ffn_gate", il); } break; } @@ -631,6 +635,9 @@ ggml_tensor * llm_graph_context::build_ffn( // GLM4 seems to have numerical issues with half-precision accumulators ggml_mul_mat_set_prec(cur, GGML_PREC_F32); } + if (arch == LLM_ARCH_FALCON_H1) { + cur = ggml_scale(ctx0, cur, hparams.mlp_down_multiplier); + } } if (down_b) { diff --git a/src/llama-hparams.cpp b/src/llama-hparams.cpp index 
86c814d51b901..3196138b359af 100644 --- a/src/llama-hparams.cpp +++ b/src/llama-hparams.cpp @@ -74,7 +74,14 @@ uint32_t llama_hparams::n_embd_r() const { // TODO: maybe support other convolution strides than 1 // NOTE: since the first column of the conv_state is shifted out each time, it's not actually needed // Corresponds to Mamba's conv_states size - return (ssm_d_conv > 0 ? ssm_d_conv - 1 : 0) * (ssm_d_inner + 2*ssm_n_group*ssm_d_state); + + // check if the architecture is using d_ssm + if (ssm_mamba_d_ssm > 0) { + return (ssm_d_conv > 0 ? ssm_d_conv - 1 : 0) * (ssm_mamba_d_ssm + 2*ssm_n_group*ssm_d_state); + } else { + return (ssm_d_conv > 0 ? ssm_d_conv - 1 : 0) * (ssm_d_inner + 2*ssm_n_group*ssm_d_state); + } + } uint32_t llama_hparams::n_embd_s() const { @@ -84,7 +91,7 @@ uint32_t llama_hparams::n_embd_s() const { } // corresponds to Mamba's ssm_states size - return ssm_d_state * ssm_d_inner; + return (ssm_mamba_d_ssm > 0 ? ssm_d_state * ssm_mamba_d_ssm : ssm_d_state * ssm_d_inner); } bool llama_hparams::is_recurrent(uint32_t il) const { diff --git a/src/llama-hparams.h b/src/llama-hparams.h index 476d0a5eade28..d671edaa4d04d 100644 --- a/src/llama-hparams.h +++ b/src/llama-hparams.h @@ -115,6 +115,31 @@ struct llama_hparams { uint32_t ssm_d_state = 0; uint32_t ssm_dt_rank = 0; uint32_t ssm_n_group = 0; + uint32_t ssm_head_dim = 0; + uint32_t ssm_mamba_d_ssm = 0; + + uint32_t attn_head_dim = 0; + bool mamba_use_mlp = false; + bool mamba_norm_before_gate = false; + bool mamba_rms_norm = false; + float attention_in_multiplier = 1.0f; + float attention_out_multiplier = 1.0f; + float ssm_in_multiplier = 1.0f; + float ssm_out_multiplier = 1.0f; + float mlp_gate_multiplier = 1.0f; + float mlp_down_multiplier = 1.0f; + float key_multiplier = 1.0f; + float lm_head_multiplier = 1.0f; + float rope_theta = 10000.0f; + bool ssm_has_mup = false; + float embedding_multiplier = 1.0f; + uint32_t vocab_size = 0; + uint32_t intermediate_size = 0; + float mamba_expand = 0.0f; + bool ssm_rms_norm = false; + bool ssm_conv_bias = false; + bool ssm_proj_bias = false; + uint32_t chunk_size = 0; // for hybrid state space models std::array recurrent_layer_arr; diff --git a/src/llama-memory-hybrid.cpp b/src/llama-memory-hybrid.cpp index 03d974d852039..9a85e238ddda4 100644 --- a/src/llama-memory-hybrid.cpp +++ b/src/llama-memory-hybrid.cpp @@ -32,7 +32,7 @@ llama_memory_hybrid::llama_memory_hybrid( mem_attn(new llama_kv_cache_unified( model, filter_attn == nullptr ? 
- [&](int32_t il) { return !hparams.is_recurrent(il); } + [&](int32_t il) { return hparams.is_recurrent(il); } : filter_attn, type_k, type_v, diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 0573c5bcea0a4..607f5595c6504 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -1549,6 +1549,53 @@ void llama_model::load_hparams(llama_model_loader & ml) { default: type = LLM_TYPE_UNKNOWN; } } break; + case LLM_ARCH_FALCON_H1: + { + // Common parameters + ml.get_key(LLM_KV_VOCAB_SIZE, hparams.vocab_size); + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + + // SSM parameters + ml.get_key(LLM_KV_MAMBA_D_SSM, hparams.ssm_mamba_d_ssm); + ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv); + ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner); + ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state); + ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank); + ml.get_key(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group); + ml.get_key(LLM_KV_SSM_HEAD_DIM, hparams.ssm_head_dim); + ml.get_key(LLM_KV_FALCON_H1_MAMBA_CHUNK_SIZE, hparams.chunk_size); + + // Falcon-H1 parameters + ml.get_key(LLM_KV_ATTN_HEAD_DIM, hparams.attn_head_dim); + ml.get_key(LLM_KV_FALCON_H1_USE_MLP, hparams.mamba_use_mlp); + ml.get_key(LLM_KV_FALCON_H1_ATTENTION_IN_MULTIPLIER, hparams.attention_in_multiplier); + ml.get_key(LLM_KV_FALCON_H1_ATTENTION_OUT_MULTIPLIER, hparams.attention_out_multiplier); + ml.get_key(LLM_KV_FALCON_H1_SSM_IN_MULTIPLIER, hparams.ssm_in_multiplier); + ml.get_key(LLM_KV_FALCON_H1_SSM_OUT_MULTIPLIER, hparams.ssm_out_multiplier); + ml.get_key(LLM_KV_FALCON_H1_MLP_GATE_MULTIPLIER, hparams.mlp_gate_multiplier); + ml.get_key(LLM_KV_FALCON_H1_MLP_DOWN_MULTIPLIER, hparams.mlp_down_multiplier); + ml.get_key(LLM_KV_FALCON_H1_SSM_HAS_MUP, hparams.ssm_has_mup); + ml.get_key(LLM_KV_FALCON_H1_MAMBA_NORM_BEFORE_GATE, hparams.mamba_norm_before_gate); + ml.get_key(LLM_KV_FALCON_H1_MAMBA_RMS_NORM, hparams.mamba_rms_norm); + ml.get_key(LLM_KV_FALCON_H1_ROPE_THETA, hparams.rope_theta); + ml.get_key(LLM_KV_FALCON_H1_KEY_MULTIPLIER, hparams.key_multiplier); + ml.get_key(LLM_KV_FALCON_H1_LM_HEAD_MULTIPLIER, hparams.lm_head_multiplier); + ml.get_key(LLM_KV_FALCON_H1_EMBEDDING_MULTIPLIER, hparams.embedding_multiplier); + + switch (hparams.n_layer) { + case 36: + type = LLM_TYPE_0_5B; break; + case 24: + type = LLM_TYPE_1_5B; break; + case 66: + type = LLM_TYPE_1B; break; + case 32: + type = LLM_TYPE_3B; break; + case 44: + type = LLM_TYPE_7B; break; + default: type = LLM_TYPE_UNKNOWN; + } + } break; default: throw std::runtime_error("unsupported model architecture"); } @@ -4475,6 +4522,88 @@ bool llama_model::load_tensors(llama_model_loader & ml) { layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); } } break; + case LLM_ARCH_FALCON_H1: + { + // Common + const int64_t hidden_size = hparams.n_embd; // hidden_size + const int64_t vocab_size = hparams.vocab_size; // vocab_size + + // mamba2 Mixer SSM params + const int64_t ssm_conv_kernel_size = hparams.ssm_d_conv; // ssm_conv_kernel_size + const int64_t ssm_n_groups = hparams.ssm_n_group; // ssm_n_groups + const int64_t ssm_state_size = hparams.ssm_d_state; // ssm_state_size + const int64_t ssm_intermediate_size = hparams.ssm_mamba_d_ssm > 0 ? 
hparams.ssm_mamba_d_ssm : int(hparams.mamba_expand * hidden_size); // TODO expand + const int64_t ssm_num_heads = hparams.ssm_dt_rank; // ssm_num_heads + const int64_t ssm_conv_dim = ssm_intermediate_size + 2 * ssm_n_groups * ssm_state_size; + const int64_t ssm_projection_size = ssm_intermediate_size + ssm_conv_dim + ssm_num_heads; + + // attn params + const int64_t attn_num_attention_head = hparams.n_head(0); // rename to: attn_num_attention_head + const int64_t attn_num_key_value_head = hparams.n_head_kv(0); + const int64_t attn_head_dim = hparams.attn_head_dim > 0 ? hparams.attn_head_dim : hidden_size / attn_num_attention_head; + + // ffn params + const int64_t ffn_intermediate_size = hparams.n_ff(0); + + // embeddings + tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {hidden_size, vocab_size}, 0); + + // output + { + output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {hidden_size, vocab_size}, TENSOR_NOT_REQUIRED); + final_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {hidden_size}, 0); + } + + for (int i = 0; i < n_layer; ++i) { + auto & layer = layers[i]; + + /*SSM LAYERS*/ + // ssm in + layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {hidden_size, ssm_projection_size}, 0); + layer.ssm_in_b = create_tensor(tn(LLM_TENSOR_SSM_IN, "bias", i), {n_embd, ssm_projection_size}, llama_model_loader::TENSOR_NOT_REQUIRED); + // ssm 1d conv + layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {ssm_conv_kernel_size, ssm_conv_dim}, 0); + layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {ssm_conv_dim}, llama_model_loader::TENSOR_NOT_REQUIRED); + // ssm_dt + layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {ssm_num_heads}, 0); + // no "weight" suffix for these + layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {1, ssm_num_heads}, 0); + layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {1, ssm_num_heads}, 0); + if (hparams.ssm_has_mup == true) { + layer.ssm_mup_vec = create_tensor(tn(LLM_TENSOR_SSM_MUP_VEC, i), {2*ssm_intermediate_size + 2*ssm_n_groups*ssm_state_size + ssm_num_heads}, 0); + } + // ssm_norm + if (hparams.mamba_rms_norm == true) { + layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), {ssm_intermediate_size / ssm_n_groups, ssm_n_groups}, 0); + } + // out_proj + layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {ssm_intermediate_size, hidden_size}, 0); + + /*ATTENTION LAYERS*/ + // attention layers (with optional bias) + layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {hidden_size, attn_head_dim * attn_num_attention_head}, 0); + layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {hidden_size, attn_num_key_value_head * attn_head_dim}, 0); + layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {hidden_size, attn_num_key_value_head * attn_head_dim}, 0); + layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {attn_head_dim * attn_num_attention_head, hidden_size}, 0); + layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {hidden_size}, llama_model_loader::TENSOR_NOT_REQUIRED); + layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {attn_num_key_value_head * attn_head_dim}, llama_model_loader::TENSOR_NOT_REQUIRED); + layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {attn_num_key_value_head * attn_head_dim}, llama_model_loader::TENSOR_NOT_REQUIRED); + layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {hidden_size}, llama_model_loader::TENSOR_NOT_REQUIRED); + 
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {hidden_size}, 0); + + + // feed forward (w/ optional biases) + layer.ffn_pre_norm = create_tensor(tn(LLM_TENSOR_FFN_PRE_NORM, i), {hidden_size}, 0); + layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0)); + layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {hidden_size, ffn_intermediate_size}, 0); + layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { ffn_intermediate_size, hidden_size}, 0); + layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {hidden_size, ffn_intermediate_size}, 0); + + layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {ffn_intermediate_size}, llama_model_loader::TENSOR_NOT_REQUIRED); + layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {hidden_size}, llama_model_loader::TENSOR_NOT_REQUIRED); + layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {ffn_intermediate_size}, llama_model_loader::TENSOR_NOT_REQUIRED); + } + } break; default: throw std::runtime_error("unknown architecture"); } @@ -14525,6 +14654,285 @@ struct llm_build_ernie4_5 : public llm_graph_context { } }; +struct llm_build_falcon_h1 : public llm_graph_context { + const llama_model & model; + + llm_build_falcon_h1(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params), model(model) { + const int64_t n_embd_head = hparams.n_embd_head_v; + + ggml_tensor * cur; + ggml_tensor * inpL; + + inpL = build_inp_embd(model.tok_embd); + inpL = ggml_scale(ctx0, inpL, hparams.embedding_multiplier); + + // inp_pos - contains the positions + ggml_tensor * inp_pos = build_inp_pos(); + + // Build the inputs in the recurrent & kv cache + auto * inp = build_inp_mem_hybrid(); + + auto * inp_attn = build_attn_inp_kv_unified(); + + const float kq_scale = hparams.f_attention_scale == 0.0f ? 
1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale; + + for (int il = 0; il < n_layer; ++il) { + ggml_tensor * inpSA = inpL; + + cur = build_norm(inpL, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "attn_norm", il); + cur = ggml_scale(ctx0, cur, hparams.attention_in_multiplier); + + // self-attention + ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); + + ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); + + ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); + + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + Kcur = ggml_scale(ctx0, Kcur, hparams.key_multiplier); + + Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); + + Qcur = ggml_rope_ext( + ctx0, Qcur, inp_pos, nullptr, + n_rot, 0, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow); + + Kcur = ggml_rope_ext( + ctx0, Kcur, inp_pos, nullptr, + n_rot, 0, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); + + ggml_tensor * attn_out = build_attn(inp_attn, gf, + model.layers[il].wo, NULL, + Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il); + attn_out = ggml_scale(ctx0, attn_out, hparams.attention_out_multiplier); + cb(attn_out, "attn_out", il); + + cur = build_norm(inpL, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, il); + // Mamba2 layer + cur = ggml_scale(ctx0, cur, hparams.ssm_in_multiplier); + cb(cur, "ssm_in", il); + + ggml_tensor * ssm_out = build_mamba2_layer(inp, gf, cur, ubatch, il); + ssm_out = ggml_scale(ctx0, ssm_out, hparams.ssm_out_multiplier); + cb(ssm_out, "ssm_out", il); + + // // Aggregation + cur = ggml_add(ctx0, attn_out, ssm_out); + inpSA = ggml_add(ctx0, cur, inpSA); + cb(cur, "layer_out", il); + + if (il == n_layer - 1) { + // skip computing output for unused tokens + ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + } + + ggml_tensor * ffn_inp = inpSA; + cb(ffn_inp, "ffn_inp", il); + + // feed-forward network + cur = build_norm(ffn_inp, + model.layers[il].ffn_pre_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "ffn_norm", il); + + cur = build_ffn(cur, + model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, + model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL, + model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(cur, "ffn_out", il); + + cur = ggml_add(ctx0, cur, inpSA); + + cur = build_cvec(cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = build_norm(cur, + model.final_norm, NULL, + LLM_NORM_RMS, -1); + + cb(cur, "result_norm", -1); + res->t_embd = cur; + + // lm_head + cur = build_lora_mm(model.output, cur); + cur = ggml_scale(ctx0, cur, hparams.lm_head_multiplier); + + cb(cur, "result_output", -1); + res->t_logits = cur; + + ggml_build_forward_expand(gf, cur); + } + + ggml_tensor * build_mamba2_layer( + llm_graph_input_mem_hybrid * inp, + ggml_cgraph * gf, + ggml_tensor * cur, + const llama_ubatch & ubatch, + int il) const { + const auto * kv_state = static_cast(mctx)->get_recr(); + + const auto kv_head = kv_state->get_head(); + + const int64_t d_conv = hparams.ssm_d_conv; + const int64_t d_ssm = 
hparams.ssm_mamba_d_ssm; + const int64_t d_inner = hparams.ssm_d_inner; + const int64_t d_state = hparams.ssm_d_state; + const int64_t n_head = hparams.ssm_dt_rank; + const int64_t head_dim = hparams.ssm_head_dim == 0 ? d_inner / n_head : hparams.ssm_head_dim; + const int64_t n_group = hparams.ssm_n_group; + const int64_t n_seqs = ubatch.n_seqs; + + const int64_t n_seq_tokens = ubatch.n_seq_tokens; + + GGML_ASSERT(n_seqs != 0); + GGML_ASSERT(ubatch.equal_seqs); + GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs); + + ggml_tensor * conv_states_all = kv_state->get_r_l(il); + ggml_tensor * ssm_states_all = kv_state->get_s_l(il); + + ggml_tensor * conv = build_rs(inp, gf, conv_states_all, hparams.n_embd_r(), n_seqs); + conv = ggml_reshape_3d(ctx0, conv, d_conv - 1, d_ssm + 2*n_group*d_state, n_seqs); + + // {n_embd, n_tokens} => {n_embd, n_seq_tokens, n_seqs} + cur = ggml_reshape_3d(ctx0, cur, cur->ne[0], n_seq_tokens, n_seqs); + + // d_in_proj = 2 * self.d_inner + 2 * self.ngroups * self.d_state + self.nheads + + // {n_embd, d_in_proj} @ {n_embd, n_seq_tokens, n_seqs} => {d_in_proj, n_seq_tokens, n_seqs} + ggml_tensor * zxBCdt = build_lora_mm(model.layers[il].ssm_in, cur); + + + // check if the models has ssm_multipliers (MuP) + if (hparams.ssm_has_mup) { + struct ggml_tensor * mup_vec = model.layers[il].ssm_mup_vec; + cur = ggml_mul(ctx0, zxBCdt, mup_vec); + zxBCdt = cur; + } + + // split the above in three + ggml_tensor * z = ggml_view_4d(ctx0, zxBCdt, head_dim, n_head, n_seq_tokens, n_seqs, head_dim*zxBCdt->nb[0], zxBCdt->nb[1], zxBCdt->nb[2], 0); + ggml_tensor * xBC = ggml_view_3d(ctx0, zxBCdt, d_ssm + 2*n_group*d_state, n_seq_tokens, n_seqs, zxBCdt->nb[1], zxBCdt->nb[2], d_ssm*ggml_element_size(zxBCdt)); + ggml_tensor * dt = ggml_view_3d(ctx0, zxBCdt, n_head, n_seq_tokens, n_seqs, zxBCdt->nb[1], zxBCdt->nb[2], (2*d_ssm + 2*n_group*d_state)*ggml_element_size(zxBCdt)); + + // conv + { + // => {d_conv - 1 + n_seq_tokens, d_inner + 2*n_group*d_state, n_seqs} + ggml_tensor * conv_x = ggml_concat(ctx0, conv, ggml_transpose(ctx0, xBC), 0); + + + // copy last (d_conv - 1) columns back into the state cache + ggml_tensor * last_conv = ggml_view_3d(ctx0, conv_x, d_conv - 1, d_ssm + 2*n_group*d_state, n_seqs, conv_x->nb[1], conv_x->nb[2], n_seq_tokens*(conv_x->nb[0])); + + ggml_build_forward_expand(gf, + ggml_cpy(ctx0, last_conv, + ggml_view_1d(ctx0, conv_states_all, + (d_conv - 1)*(d_ssm + 2*n_group*d_state)*(n_seqs), + kv_head*(d_conv - 1)*(d_ssm + 2*n_group*d_state)*ggml_element_size(conv_states_all)))); + + // 1D convolution + // The equivalent is to make a self-overlapping view of conv_x + // over d_conv columns at each stride in the 3rd dimension, + // then element-wise multiply that with the conv1d weight, + // then sum the elements of each row, + // (the last two steps are a dot product over rows (also doable with mul_mat)) + // then permute away the ne[0] dimension, + // and then you're left with the resulting x tensor. + // For simultaneous sequences, all sequences need to have the same length. 
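// --- editorial note (illustrative sketch, not part of this patch) -----------
// The comment above describes the 1D convolution as a self-overlapping view of
// conv_x over d_conv columns, an element-wise multiply with the conv1d weight,
// and a per-row sum. A minimal standalone reference of that computation is
// sketched below using plain std::vector arrays. The layout
// conv_x[channel][d_conv - 1 + n_tokens] (cached conv state prepended to the new
// tokens) and the name ssm_conv_ref are assumptions for illustration only; this
// is not the ggml tensor API and does not mirror ggml_ssm_conv's signature.

#include <cstddef>
#include <vector>

// out[c][t] = sum_{k = 0 .. d_conv - 1} conv_x[c][t + k] * w[c][k]
static std::vector<std::vector<float>> ssm_conv_ref(
        const std::vector<std::vector<float>> & conv_x, // [channels][d_conv - 1 + n_tokens]
        const std::vector<std::vector<float>> & w,      // [channels][d_conv]
        size_t d_conv,
        size_t n_tokens) {
    std::vector<std::vector<float>> out(conv_x.size(), std::vector<float>(n_tokens, 0.0f));
    for (size_t c = 0; c < conv_x.size(); ++c) {
        for (size_t t = 0; t < n_tokens; ++t) {
            float acc = 0.0f;
            for (size_t k = 0; k < d_conv; ++k) {
                acc += conv_x[c][t + k] * w[c][k]; // overlapping window of d_conv columns
            }
            out[c][t] = acc; // causal: token t only sees tokens t - (d_conv - 1) .. t
        }
    }
    return out;
}
// --- end of editorial note ---------------------------------------------------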
+ xBC = ggml_ssm_conv(ctx0, conv_x, model.layers[il].ssm_conv1d); + + // bias + xBC = ggml_add(ctx0, xBC, model.layers[il].ssm_conv1d_b); + + xBC = ggml_silu(ctx0, xBC); + } + + // ssm + { + // These correspond to V K Q in SSM/attention duality + ggml_tensor * x = ggml_view_4d(ctx0, xBC, head_dim, n_head, n_seq_tokens, n_seqs, head_dim*xBC->nb[0], xBC->nb[1], xBC->nb[2], 0); + + ggml_tensor * B = ggml_view_4d(ctx0, xBC, d_state, n_group, n_seq_tokens, n_seqs, d_state*xBC->nb[0], xBC->nb[1], xBC->nb[2], d_ssm*ggml_element_size(xBC)); + + ggml_tensor * C = ggml_view_4d(ctx0, xBC, d_state, n_group, n_seq_tokens, n_seqs, d_state*xBC->nb[0], xBC->nb[1], xBC->nb[2], (d_ssm + n_group*d_state)*ggml_element_size(xBC)); + + // {n_head, n_seq_tokens, n_seqs} + dt = ggml_add(ctx0, ggml_cont(ctx0, dt), model.layers[il].ssm_dt_b); + + + ggml_tensor * A = model.layers[il].ssm_a; + + // use the states and the indices provided by build_rs + // (this is necessary in order to properly use the states before they are overwritten, + // while avoiding to make unnecessary copies of the states) + auto get_ssm_rows = [&](ggml_context * ctx, ggml_tensor * states, ggml_tensor * ids) { + ggml_tensor * ssm = ggml_reshape_4d(ctx, states, d_state, head_dim, n_head, kv_state->get_size()); + + // TODO: use semistructured matrices to implement state-space duality + // => {d_inner, n_seq_tokens, n_seqs} and {d_state, d_inner, n_seqs} + return ggml_ssm_scan(ctx, ssm, x, dt, A, B, C, ids); + }; + + ggml_tensor * y_ssm = build_rs(inp, gf, ssm_states_all, hparams.n_embd_s(), ubatch.n_seqs, get_ssm_rows); + + // store last states + ggml_build_forward_expand(gf, + ggml_cpy(ctx0, + ggml_view_1d(ctx0, y_ssm, d_state*d_ssm*n_seqs, ggml_nelements(x)*x->nb[0]), + ggml_view_1d(ctx0, ssm_states_all, d_state*d_ssm*n_seqs, kv_head*d_state*d_ssm*ggml_element_size(ssm_states_all)))); + + ggml_tensor * y = ggml_view_4d(ctx0, y_ssm, head_dim, n_head, n_seq_tokens, n_seqs, x->nb[1], n_head*x->nb[1], n_seq_tokens*n_head*x->nb[1], 0); + + // TODO: skip computing output earlier for unused tokens + + y = ggml_add(ctx0, y, ggml_mul(ctx0, x, model.layers[il].ssm_d)); + y = ggml_mul(ctx0, y, ggml_silu(ctx0, ggml_cont(ctx0, z))); + + // grouped RMS norm + if (hparams.mamba_rms_norm){ + y = ggml_reshape_4d(ctx0, y, d_ssm / n_group, n_group, n_seq_tokens, n_seqs); + y = build_norm(y, model.layers[il].ssm_norm, NULL, LLM_NORM_RMS, il); + } + y = ggml_reshape_3d(ctx0, y, d_ssm, n_seq_tokens, n_seqs); + + // {d_inner, n_embd} @ {d_inner, n_seq_tokens, n_seqs} => {n_embd, n_seq_tokens, n_seqs} + cur = build_lora_mm(model.layers[il].ssm_out, y); + } + + // {n_embd, n_seq_tokens, n_seqs} => {n_embd, n_tokens} + cur = ggml_reshape_2d(ctx0, cur, cur->ne[0], n_seq_tokens * n_seqs); + cb(cur, "mamba_out", il); + return cur; + } +}; + struct llm_build_arcee : public llm_graph_context { llm_build_arcee(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; @@ -14693,6 +15101,15 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params, cparams.n_ctx = GGML_PAD(cparams.n_ctx, padding); + // -> attn_filter + // if falcon-h1 -> [&](int32_t il) { return true; } + // case LLM_ARCH_FALCON_H1: + // llama_memory_hybrid::layer_filter_cb filter_attn = [](int32_t /*il*/) { return true; }; + // llama_memory_hybrid::layer_filter_cb filter_recr = [](int32_t /*il*/) { return true; }; + // default: + // llama_memory_hybrid::layer_filter_cb filter_attn = 
nullptr; + // llama_memory_hybrid::layer_filter_cb filter_recr = nullptr; + res = new llama_memory_hybrid( /* model */ *this, /* attn_type_k */ params.type_k, @@ -15040,6 +15457,10 @@ llm_graph_result_ptr llama_model::build_graph( { llm = std::make_unique(*this, params, gf); } break; + case LLM_ARCH_FALCON_H1: + { + llm = std::make_unique(*this, params, gf); + } break; default: GGML_ABORT("fatal error"); } @@ -15193,6 +15614,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { case LLM_ARCH_NEO_BERT: case LLM_ARCH_ARCEE: case LLM_ARCH_ERNIE4_5: + case LLM_ARCH_FALCON_H1: return LLAMA_ROPE_TYPE_NORM; // the pairs of head values are offset by n_rot/2 diff --git a/src/llama-model.h b/src/llama-model.h index 979fff62045f9..fc235cd23d4c0 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -173,6 +173,7 @@ struct llama_layer { struct ggml_tensor * attn_norm_cross = nullptr; struct ggml_tensor * attn_norm_enc = nullptr; struct ggml_tensor * ssm_norm = nullptr; + struct ggml_tensor * final_norm = nullptr; // attention struct ggml_tensor * wq = nullptr; @@ -215,6 +216,7 @@ struct llama_layer { struct ggml_tensor * layer_out_norm_b = nullptr; struct ggml_tensor * ffn_norm_exps = nullptr; struct ggml_tensor * ffn_norm_enc = nullptr; + struct ggml_tensor * ffn_pre_norm = nullptr; // ff struct ggml_tensor * ffn_gate = nullptr; // w1 @@ -224,6 +226,10 @@ struct llama_layer { struct ggml_tensor * ffn_down_enc = nullptr; struct ggml_tensor * ffn_up_enc = nullptr; + // falcon-h1 + struct ggml_tensor * ssm_in_b = nullptr; + struct ggml_tensor * ssm_mup_vec = nullptr; + // ff MoE struct ggml_tensor * ffn_gate_inp = nullptr; struct ggml_tensor * ffn_gate_exps = nullptr; @@ -361,6 +367,7 @@ struct llama_model { struct ggml_tensor * output = nullptr; struct ggml_tensor * output_b = nullptr; struct ggml_tensor * output_norm_enc = nullptr; + struct ggml_tensor * final_norm = nullptr; // classifier struct ggml_tensor * cls = nullptr; diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp index 5c9eb87566dde..d492515609ca3 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -1522,6 +1522,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { tokenizer_pre == "llama-v3" || tokenizer_pre == "llama-bpe"|| tokenizer_pre == "falcon3" || + tokenizer_pre == "falcon-h1" || tokenizer_pre == "pixtral") { pre_type = LLAMA_VOCAB_PRE_TYPE_LLAMA3; ignore_merges = true; From 71a6848e2d2b8484f4b1512bf5e4bfc1ecf4b215 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Thu, 3 Jul 2025 15:08:23 +0400 Subject: [PATCH 03/98] another fix --- src/llama-model.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 607f5595c6504..4f8333cb69bbc 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -1582,6 +1582,8 @@ void llama_model::load_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_FALCON_H1_LM_HEAD_MULTIPLIER, hparams.lm_head_multiplier); ml.get_key(LLM_KV_FALCON_H1_EMBEDDING_MULTIPLIER, hparams.embedding_multiplier); + std::fill(hparams.recurrent_layer_arr.begin(), hparams.recurrent_layer_arr.end(), true); + switch (hparams.n_layer) { case 36: type = LLM_TYPE_0_5B; break; From 03568c93587c971bf87b3638017a9de81357cd26 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Thu, 3 Jul 2025 15:10:18 +0400 Subject: [PATCH 04/98] fix --- src/llama-model.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 4f8333cb69bbc..fb1850b490d8f 100644 --- 
a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -14674,8 +14674,6 @@ struct llm_build_falcon_h1 : public llm_graph_context { // Build the inputs in the recurrent & kv cache auto * inp = build_inp_mem_hybrid(); - auto * inp_attn = build_attn_inp_kv_unified(); - const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale; for (int il = 0; il < n_layer; ++il) { @@ -14718,7 +14716,7 @@ struct llm_build_falcon_h1 : public llm_graph_context { cb(Kcur, "Kcur", il); cb(Vcur, "Vcur", il); - ggml_tensor * attn_out = build_attn(inp_attn, gf, + ggml_tensor * attn_out = build_attn(inp, gf, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il); attn_out = ggml_scale(ctx0, attn_out, hparams.attention_out_multiplier); From 0c93ef6a9c4656f59783f97b53edec8c58c0557c Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Thu, 3 Jul 2025 15:26:33 +0400 Subject: [PATCH 05/98] more fixes --- src/llama-memory-hybrid.cpp | 2 +- src/llama-model.cpp | 12 +++--------- 2 files changed, 4 insertions(+), 10 deletions(-) diff --git a/src/llama-memory-hybrid.cpp b/src/llama-memory-hybrid.cpp index 9a85e238ddda4..03d974d852039 100644 --- a/src/llama-memory-hybrid.cpp +++ b/src/llama-memory-hybrid.cpp @@ -32,7 +32,7 @@ llama_memory_hybrid::llama_memory_hybrid( mem_attn(new llama_kv_cache_unified( model, filter_attn == nullptr ? - [&](int32_t il) { return hparams.is_recurrent(il); } + [&](int32_t il) { return !hparams.is_recurrent(il); } : filter_attn, type_k, type_v, diff --git a/src/llama-model.cpp b/src/llama-model.cpp index fb1850b490d8f..5285e13f3ef9e 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -15101,14 +15101,6 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params, cparams.n_ctx = GGML_PAD(cparams.n_ctx, padding); - // -> attn_filter - // if falcon-h1 -> [&](int32_t il) { return true; } - // case LLM_ARCH_FALCON_H1: - // llama_memory_hybrid::layer_filter_cb filter_attn = [](int32_t /*il*/) { return true; }; - // llama_memory_hybrid::layer_filter_cb filter_recr = [](int32_t /*il*/) { return true; }; - // default: - // llama_memory_hybrid::layer_filter_cb filter_attn = nullptr; - // llama_memory_hybrid::layer_filter_cb filter_recr = nullptr; res = new llama_memory_hybrid( /* model */ *this, @@ -15123,7 +15115,9 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params, /* recurrent_type_v */ GGML_TYPE_F32, /* recurrent_kv_size */ std::max((uint32_t) 1, cparams.n_seq_max), /* n_seq_max */ cparams.n_seq_max, - /* offload */ cparams.offload_kqv); + /* offload */ cparams.offload_kqv, + /* filter_attn */ (arch == LLM_ARCH_FALCON_H1) ? [&](int32_t) { return true; } : (llama_memory_hybrid::layer_filter_cb)nullptr, + /* filter_recr */ (arch == LLM_ARCH_FALCON_H1) ? 
[&](int32_t) { return true; } : (llama_memory_hybrid::layer_filter_cb)nullptr); } else { const auto padding = llama_kv_cache_unified::get_padding(cparams); From fdd5cff4ba70444b4705b8d16895cd1a8541585c Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Thu, 3 Jul 2025 17:12:05 +0400 Subject: [PATCH 06/98] minor fix --- src/llama-model.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 5285e13f3ef9e..7ae5caf4e4c17 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -10203,7 +10203,7 @@ struct llm_build_mamba : public llm_graph_context { // {n_embd, n_seq_tokens, n_seqs} => {n_embd, n_tokens} cur = ggml_reshape_2d(ctx0, cur, cur->ne[0], n_seq_tokens * n_seqs); - // cb(cur, "mamba_out", il); + cb(cur, "mamba_out", il); return cur; } @@ -14697,6 +14697,7 @@ struct llm_build_falcon_h1 : public llm_graph_context { Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + Kcur = ggml_scale(ctx0, Kcur, hparams.key_multiplier); Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); From 14c37ec047e41a310a2261cbf5ee43140b07e11c Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Thu, 3 Jul 2025 18:09:30 +0400 Subject: [PATCH 07/98] more cleaning on python code --- convert_hf_to_gguf.py | 138 +++++++++++++++++++++++++++++++++ gguf-py/gguf/constants.py | 40 ++++++++++ gguf-py/gguf/gguf_writer.py | 9 +++ gguf-py/gguf/tensor_mapping.py | 16 ++++ src/llama-model.cpp | 1 + 5 files changed, 204 insertions(+) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index dd80a4a05d596..3566821890310 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -6535,6 +6535,144 @@ def set_gguf_parameters(self): super().set_gguf_parameters() self.gguf_writer.add_audio_stack_factor(self.global_config["stack_factor"]) + +@ModelBase.register("FalconH1ForCausalLM") +class FalconH1Model(Mamba2Model): + model_arch = gguf.MODEL_ARCH.FALCON_H1 + + def __init__(self, *args, **kwargs): + # Set the hparam prefixes for Falcon Mamba2 + self.hparam_prefixes = ["mamba"] + + # Initialize the base Mamba2Model + super().__init__(*args, **kwargs) + + # Use Llama conversion for attention + self._transformer_model_class = LlamaModel + + # n_group and d_inner are used during reshape_tensors for mamaba2 + self.d_model = self.find_hparam(["hidden_size", "d_model"]) + self.n_group = self.find_hparam(["n_groups"]) + self.d_inner = self.find_hparam(["expand"]) * self.d_model + + # Initialize any Falcon Mamba2 specific attributes + self.has_attention = True # Falcon Mamba2 has attention components + + # Load Falcon-H1 multipliers from hyperparameters + self.attention_in_multiplier = self.find_hparam(["attention_in_multiplier"], optional=True) + self.attention_out_multiplier = self.find_hparam(["attention_out_multiplier"], optional=True) + self.ssm_in_multiplier = self.find_hparam(["ssm_in_multiplier"], optional=True) + self.ssm_out_multiplier = self.find_hparam(["ssm_out_multiplier"], optional=True) + self.mlp_multipliers = self.find_hparam(["mlp_multipliers"], optional=True) + self.ssm_multipliers = self.find_hparam(["ssm_multipliers"], optional=True) + self.intermediate_size = self.find_hparam(["intermediate_size"]) + + def find_hparam(self, keys: Iterable[str], *args, **kwargs) -> Any: + prefixed = [] + for pfx in self.hparam_prefixes: + prefixed.extend( + "_".join([pfx, k]) + for k in keys + ) + keys = list(keys) + prefixed + return 
super().find_hparam(keys, *args, **kwargs) + + def _generate_mup_vector(self, block_id: int) -> torch.Tensor: + zxbcdt_multipliers = self.hparams["ssm_multipliers"] + intermediate_size = self.hparams["mamba_d_ssm"] + groups_time_state_size = self.hparams["mamba_n_groups"] * self.hparams["mamba_d_state"] + vector_shape = (2 * intermediate_size + 2 * groups_time_state_size + self.hparams["mamba_n_heads"]) + + mup_vector = torch.ones(1, 1, vector_shape) + mup_vector[:, :, :intermediate_size] *= zxbcdt_multipliers[0] + mup_vector[:, :, intermediate_size:2 * intermediate_size] *= zxbcdt_multipliers[1] + mup_vector[:, :, 2 * intermediate_size:2 * intermediate_size + groups_time_state_size] *= zxbcdt_multipliers[2] + mup_vector[:, :, 2 * intermediate_size + groups_time_state_size:2 * intermediate_size + 2 * groups_time_state_size] *= zxbcdt_multipliers[3] + mup_vector[:, :, 2 * intermediate_size + 2 * groups_time_state_size:] *= zxbcdt_multipliers[4] + + return mup_vector + + def get_tensors(self) -> Iterator[tuple[str, Tensor]]: + for name, tensor in super().get_tensors(): + if name.startswith("model.backbone") or name.startswith("model.lm_head"): + name = name.removeprefix("model.") + yield name, tensor + + if self.ssm_multipliers is not None: + # Insert MUP vector after mamba.dt_bias + if "mamba.dt_bias" in name: + block_match = re.search(r"(?:model\.layers\.)?(\d+)\.mamba\.dt_bias", name) + if block_match: + block_id = int(block_match.group(1)) + # Generate MUP vector with correct name format + mup_tensor = self._generate_mup_vector(block_id) + mup_name = f"blk.{block_id}.ssm_mup_vec" + logger.debug(f"Inserting MUP vector for block {block_id}: {mup_name}") + yield mup_name, mup_tensor + + def set_gguf_parameters(self): + super().set_gguf_parameters() + + ## General Params ## + self.gguf_writer.add_block_count(self.block_count) + self.gguf_writer.add_context_length(self.hparams.get("max_position_embeddings", 0)) + self.gguf_writer.add_vocab_size(self.hparams["vocab_size"]) + self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) + + ## Mamba mixer params ## + self.gguf_writer.add_ssm_conv_kernel(self.find_hparam(["conv_kernel", "d_conv"])) + self.gguf_writer.add_ssm_group_count(self.n_group) + self.gguf_writer.add_ssm_inner_size(self.d_inner) + self.gguf_writer.add_ssm_head_dim(d_head := self.find_hparam(["d_head"])) + self.gguf_writer.add_ssm_time_step_rank(self.find_hparam(["n_heads"])) + + ## Attention params ## + self.gguf_writer.add_head_count(self.hparams["num_attention_heads"]) + self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"] if "num_key_value_heads" in self.hparams else self.hparams["num_attention_heads"]) + self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) + self.gguf_writer.add_key_length(self.hparams["head_dim"]) + self.gguf_writer.add_value_length(self.hparams["head_dim"]) + self.gguf_writer.add_float32("falcon-h1.key_multiplier", self.hparams["key_multiplier"]) + + ## Other params + self.gguf_writer.add_float32("falcon-h1.lm_head_multiplier", self.hparams["lm_head_multiplier"]) + self.gguf_writer.add_float32("falcon-h1.embedding_multiplier", self.hparams["embedding_multiplier"]) + + ## Validation ## + assert self.hparams.get("hidden_act") in [None, "silu"], "Only SILU activation supported" + assert self.d_inner % d_head == 0, f"SSM inner size {self.d_inner} not a multiple of head dim {d_head}" + + + # Add Falcon Mamba2 specific configuration + self.gguf_writer.add_uint32("falcon-h1.ssm.mamba_chunk_size", 
self.hparams["mamba_chunk_size"]) + self.gguf_writer.add_uint32("falcon-h1.attention.head_dim", self.hparams["head_dim"]) + self.gguf_writer.add_uint32("falcon-h1.ssm.mamba_d_ssm", self.hparams["mamba_d_ssm"]) + self.gguf_writer.add_uint32("falcon-h1.num_attention_heads", self.find_hparam(["num_attention_heads"])) + self.gguf_writer.add_uint32("falcon-h1.num_key_value_heads", + self.find_hparam(["num_key_value_heads"], optional=True) or + self.find_hparam(["num_attention_heads"])) + + # Add multipliers as metadata instead of tensors + self.gguf_writer.add_float32("falcon-h1.attention_in_multiplier", self.attention_in_multiplier) + self.gguf_writer.add_float32("falcon-h1.attention_out_multiplier", self.attention_out_multiplier) + self.gguf_writer.add_float32("falcon-h1.ssm_in_multiplier", self.ssm_in_multiplier) + self.gguf_writer.add_float32("falcon-h1.ssm_out_multiplier", self.ssm_out_multiplier) + + # Add MLP multipliers + if isinstance(self.mlp_multipliers, (list, tuple)) and len(self.mlp_multipliers) == 2: + self.gguf_writer.add_float32("falcon-h1.mlp_gate_multiplier", self.mlp_multipliers[0]) + self.gguf_writer.add_float32("falcon-h1.mlp_down_multiplier", self.mlp_multipliers[1]) + + # Add has MuP flag if SSM multipliers are present + if self.ssm_multipliers is not None: + self.gguf_writer.add_bool("falcon-h1.ssm.has_mup", True) + + # Add any other Falcon Mamba2 specific configuration + self.gguf_writer.add_bool("falcon-h1.mamba_use_mlp", self.find_hparam(["mamba_use_mlp"], optional=True)) + self.gguf_writer.add_bool("falcon-h1.mamba_norm_before_gate", self.find_hparam(["mamba_norm_before_gate"], optional=True)) + self.gguf_writer.add_bool("falcon-h1.mamba_rms_norm", self.find_hparam(["mamba_rms_norm"], optional=True)) + self.gguf_writer.add_float32("falcon-h1.rope_theta", self.find_hparam(["rope_theta"], optional=True)) + ###### CONVERSION LOGIC ###### diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index c12609c6d9f99..08223927a7cbb 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -172,6 +172,7 @@ class SSM: TIME_STEP_RANK = "{arch}.ssm.time_step_rank" GROUP_COUNT = "{arch}.ssm.group_count" DT_B_C_RMS = "{arch}.ssm.dt_b_c_rms" + HEAD_DIM = "{arch}.ssm.head_dim" class WKV: HEAD_SIZE = "{arch}.wkv.head_size" @@ -288,6 +289,7 @@ class MODEL_ARCH(IntEnum): LLAMA4 = auto() DECI = auto() FALCON = auto() + FALCON_H1 = auto() BAICHUAN = auto() GROK = auto() GPT2 = auto() @@ -525,6 +527,7 @@ class MODEL_TENSOR(IntEnum): POSNET_ATTN_K = auto() POSNET_ATTN_V = auto() POSNET_ATTN_OUT = auto() + SSM_MUP_VEC = auto() # vision V_MMPROJ = auto() V_MMPROJ_FC = auto() @@ -660,6 +663,7 @@ class MODEL_TENSOR(IntEnum): MODEL_ARCH.DOTS1: "dots1", MODEL_ARCH.ARCEE: "arcee", MODEL_ARCH.ERNIE4_5: "ernie4_5", + MODEL_ARCH.FALCON_H1: "falcon_h1", } VISION_PROJECTOR_TYPE_NAMES: dict[VISION_PROJECTOR_TYPE, str] = { @@ -736,6 +740,7 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.SSM_D: "blk.{bid}.ssm_d", MODEL_TENSOR.SSM_NORM: "blk.{bid}.ssm_norm", MODEL_TENSOR.SSM_OUT: "blk.{bid}.ssm_out", + MODEL_TENSOR.SSM_MUP_VEC: "blk.{bid}.ssm_mup_vec", MODEL_TENSOR.TIME_MIX_W0: "blk.{bid}.time_mix_w0", MODEL_TENSOR.TIME_MIX_W1: "blk.{bid}.time_mix_w1", MODEL_TENSOR.TIME_MIX_W2: "blk.{bid}.time_mix_w2", @@ -2211,6 +2216,41 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.FFN_DOWN, MODEL_TENSOR.FFN_UP, ], + MODEL_ARCH.FALCON_H1: [ + # Token embedding + MODEL_TENSOR.TOKEN_EMBD, + + # Input layernorm + MODEL_TENSOR.ATTN_NORM, + + # Attention components + MODEL_TENSOR.ATTN_Q, # Query 
projection + MODEL_TENSOR.ATTN_K, # Key projection + MODEL_TENSOR.ATTN_V, # Value projection + MODEL_TENSOR.ATTN_OUT, # Output projection + + # SSM components (Mamba2 specific) + MODEL_TENSOR.SSM_MUP_VEC, # Mup vector + MODEL_TENSOR.SSM_IN, # Input projection for SSM + MODEL_TENSOR.SSM_CONV1D, # Convolution layer + MODEL_TENSOR.SSM_DT, # Delta time projection + MODEL_TENSOR.SSM_A, # A parameter (log form) + MODEL_TENSOR.SSM_D, # D parameter + MODEL_TENSOR.SSM_NORM, # Normalization in SSM + MODEL_TENSOR.SSM_OUT, # Output projection + + # Pre-feedforward layernorm + MODEL_TENSOR.FFN_PRE_NORM, + + # Feed-forward network components + MODEL_TENSOR.FFN_GATE, # Gate projection (SwiGLU) + MODEL_TENSOR.FFN_DOWN, # Down projection + MODEL_TENSOR.FFN_UP, # Up projection + + # Post-feedforward layernorm + MODEL_TENSOR.OUTPUT_NORM, # Final layer norm + MODEL_TENSOR.OUTPUT, # Output projection (lm_head) + ], # TODO } diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index a7ecf3d31209f..7043fa375956c 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -867,6 +867,15 @@ def add_ssm_group_count(self, value: int) -> None: def add_ssm_dt_b_c_rms(self, value: bool) -> None: self.add_bool(Keys.SSM.DT_B_C_RMS.format(arch=self.arch), value) + def add_ssm_head_dim(self, value: int) -> None: + self.add_uint32(Keys.SSM.HEAD_DIM.format(arch=self.arch), value) + + def add_attn_head_count(self, count: int) -> None: + self.add_uint32(Keys.Attention.HEAD_COUNT.format(arch=self.arch), count) + + def add_key_value_head_count(self, count: int) -> None: + self.add_uint32(Keys.Attention.HEAD_COUNT_KV.format(arch=self.arch), count) + def add_tokenizer_model(self, model: str) -> None: self.add_string(Keys.Tokenizer.MODEL, model) diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 51634ef6bdd2e..be3fff848b2e4 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -286,12 +286,14 @@ class TensorNameMap: # Post feed-forward norm MODEL_TENSOR.FFN_PRE_NORM: ( "model.layers.{bid}.pre_feedforward_layernorm", # gemma2 + "model.layers.{bid}.pre_ff_layernorm.weight", ), # Post feed-forward norm MODEL_TENSOR.FFN_POST_NORM: ( "model.layers.{bid}.post_feedforward_layernorm", # gemma2 olmo2 "model.layers.{bid}.post_mlp_layernorm", # glm-4-0414 + "model.layers.{bid}.feed_forward.up_proj", ), MODEL_TENSOR.FFN_GATE_INP: ( @@ -362,6 +364,7 @@ class TensorNameMap: "model.layers.{bid}.mlp.shared_expert.up_proj", # qwen2moe "model.layers.{bid}.mlp.shared_experts.up_proj", # deepseek deepseek2 "model.layers.{bid}.feed_forward.shared_expert.up_proj", # llama4 + "model.layers.{bid}.feed_forward.down_proj", ), # AWQ-activation gate @@ -547,11 +550,13 @@ class TensorNameMap: MODEL_TENSOR.SSM_IN: ( "model.layers.{bid}.in_proj", "backbone.layers.{bid}.mixer.in_proj", + "model.layers.{bid}.mamba.in_proj", ), MODEL_TENSOR.SSM_CONV1D: ( "model.layers.{bid}.conv1d", "backbone.layers.{bid}.mixer.conv1d", + "model.layers.{bid}.mamba.conv1d", ), MODEL_TENSOR.SSM_X: ( @@ -562,16 +567,19 @@ class TensorNameMap: MODEL_TENSOR.SSM_DT: ( "model.layers.{bid}.dt_proj", "backbone.layers.{bid}.mixer.dt_proj", + "model.layers.{bid}.mamba.dt_proj", ), MODEL_TENSOR.SSM_A: ( "model.layers.{bid}.A_log", "backbone.layers.{bid}.mixer.A_log", + "model.layers.{bid}.mamba.A_log", ), MODEL_TENSOR.SSM_D: ( "model.layers.{bid}.D", "backbone.layers.{bid}.mixer.D", + "model.layers.{bid}.mamba.D", ), MODEL_TENSOR.SSM_NORM: ( @@ -1168,6 +1176,14 @@ class TensorNameMap: 
"resampler.attn.out_proj", ), + MODEL_TENSOR.SSM_MUP_VEC: ( + "model.layers.{bid}.mamba.mup_vector", # falcon-h1 + ), + + MODEL_TENSOR.SSM_NORM: ( + "model.layers.{bid}.mamba.norm", + ), + MODEL_TENSOR.V_RESMPL_KV: ( "resampler.kv_proj", ), diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 7ae5caf4e4c17..fc822331aaea2 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -14921,6 +14921,7 @@ struct llm_build_falcon_h1 : public llm_graph_context { y = ggml_reshape_4d(ctx0, y, d_ssm / n_group, n_group, n_seq_tokens, n_seqs); y = build_norm(y, model.layers[il].ssm_norm, NULL, LLM_NORM_RMS, il); } + y = ggml_reshape_3d(ctx0, y, d_ssm, n_seq_tokens, n_seqs); // {d_inner, n_embd} @ {d_inner, n_seq_tokens, n_seqs} => {n_embd, n_seq_tokens, n_seqs} From 8bea92261eaf5984818dbae487ebb7e0e82c6914 Mon Sep 17 00:00:00 2001 From: ibrahimkhadraoui Date: Fri, 4 Jul 2025 14:32:11 +0400 Subject: [PATCH 08/98] python fixes --- convert_hf_to_gguf.py | 14 ++++++++++++-- gguf-py/gguf/tensor_mapping.py | 1 + 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 3566821890310..193fee8d2d0fc 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -686,6 +686,9 @@ def get_vocab_base_pre(self, tokenizer) -> str: if chkhsh == "9d032fcbd5501f4a38150912590928bfb36091efb5df11b8e2124b0390e3fb1e": # ref: https://huggingface.co/tiiuae/Falcon3-7B-Base res = "falcon3" + if chkhsh == "60476e1243776c4fb1b993dbd7a5f15ac22f83c80afdf425fa5ae01c8d44ef86": + # ref: https://huggingface.co/collections/tiiuae/falcon-h1-6819f2795bc406da60fab8df + res = "falcon-H1" if chkhsh == "8e62295832751ca1e8f92f2226f403dea30dc5165e448b5bfa05af5340c64ec7": # ref: https://huggingface.co/BAAI/bge-large-zh-v1.5 res = "bert-bge-large" @@ -4905,8 +4908,11 @@ def set_gguf_parameters(self): # Fail early for models which don't have a block expansion factor of 2 # TODO: does this really matter? 
- assert d_inner == 2 * d_model - assert d_inner % head_dim == 0 + # skip the assertion for FalconH1 Model + architectures = self.hparams.get("architectures") + if architectures is None or architectures[0] != "FalconH1ForCausalLM": + assert d_inner == 2 * d_model + assert d_inner % head_dim == 0 self.gguf_writer.add_context_length(2**20) # arbitrary value; for those who use the default self.gguf_writer.add_embedding_length(d_model) @@ -4945,6 +4951,10 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter d_model = self.find_hparam(["hidden_size", "d_model", "dim"]) d_inner = self.find_hparam(["intermediate_size", "d_inner"], optional=True) or 2 * d_model n_group = self.hparams.get("n_groups", 1) + architectures = self.hparams.get("architectures") + if architectures is not None and architectures[0] == "FalconH1ForCausalLM": + # FalconH1F has a different d_inner + d_inner = self.hparams.get("mamba_d_ssm") data_torch = data_torch.reshape((n_group, d_inner // n_group)) if name.endswith(".A_log"): diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index be3fff848b2e4..45a6a70d49630 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -589,6 +589,7 @@ class TensorNameMap: MODEL_TENSOR.SSM_OUT: ( "model.layers.{bid}.out_proj", "backbone.layers.{bid}.mixer.out_proj", + "model.layers.{bid}.mamba.out_proj", # falcon-h1 ), MODEL_TENSOR.TIME_MIX_W0: ( From 071f4b7fd86e097c444b02e33e8a14123a9116a7 Mon Sep 17 00:00:00 2001 From: ibrahimkhadraoui Date: Fri, 4 Jul 2025 14:37:02 +0400 Subject: [PATCH 09/98] changed precision for multipliers float 32->64 --- convert_hf_to_gguf.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 193fee8d2d0fc..6c3f66721f7cb 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -6642,11 +6642,11 @@ def set_gguf_parameters(self): self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) self.gguf_writer.add_key_length(self.hparams["head_dim"]) self.gguf_writer.add_value_length(self.hparams["head_dim"]) - self.gguf_writer.add_float32("falcon-h1.key_multiplier", self.hparams["key_multiplier"]) + self.gguf_writer.add_float64("falcon-h1.key_multiplier", self.hparams["key_multiplier"]) ## Other params - self.gguf_writer.add_float32("falcon-h1.lm_head_multiplier", self.hparams["lm_head_multiplier"]) - self.gguf_writer.add_float32("falcon-h1.embedding_multiplier", self.hparams["embedding_multiplier"]) + self.gguf_writer.add_float64("falcon-h1.lm_head_multiplier", self.hparams["lm_head_multiplier"]) + self.gguf_writer.add_float64("falcon-h1.embedding_multiplier", self.hparams["embedding_multiplier"]) ## Validation ## assert self.hparams.get("hidden_act") in [None, "silu"], "Only SILU activation supported" @@ -6663,15 +6663,15 @@ def set_gguf_parameters(self): self.find_hparam(["num_attention_heads"])) # Add multipliers as metadata instead of tensors - self.gguf_writer.add_float32("falcon-h1.attention_in_multiplier", self.attention_in_multiplier) - self.gguf_writer.add_float32("falcon-h1.attention_out_multiplier", self.attention_out_multiplier) - self.gguf_writer.add_float32("falcon-h1.ssm_in_multiplier", self.ssm_in_multiplier) - self.gguf_writer.add_float32("falcon-h1.ssm_out_multiplier", self.ssm_out_multiplier) + self.gguf_writer.add_float64("falcon-h1.attention_in_multiplier", self.attention_in_multiplier) + self.gguf_writer.add_float64("falcon-h1.attention_out_multiplier", 
self.attention_out_multiplier) + self.gguf_writer.add_float64("falcon-h1.ssm_in_multiplier", self.ssm_in_multiplier) + self.gguf_writer.add_float64("falcon-h1.ssm_out_multiplier", self.ssm_out_multiplier) # Add MLP multipliers if isinstance(self.mlp_multipliers, (list, tuple)) and len(self.mlp_multipliers) == 2: - self.gguf_writer.add_float32("falcon-h1.mlp_gate_multiplier", self.mlp_multipliers[0]) - self.gguf_writer.add_float32("falcon-h1.mlp_down_multiplier", self.mlp_multipliers[1]) + self.gguf_writer.add_float64("falcon-h1.mlp_gate_multiplier", self.mlp_multipliers[0]) + self.gguf_writer.add_float64("falcon-h1.mlp_down_multiplier", self.mlp_multipliers[1]) # Add has MuP flag if SSM multipliers are present if self.ssm_multipliers is not None: @@ -6681,7 +6681,7 @@ def set_gguf_parameters(self): self.gguf_writer.add_bool("falcon-h1.mamba_use_mlp", self.find_hparam(["mamba_use_mlp"], optional=True)) self.gguf_writer.add_bool("falcon-h1.mamba_norm_before_gate", self.find_hparam(["mamba_norm_before_gate"], optional=True)) self.gguf_writer.add_bool("falcon-h1.mamba_rms_norm", self.find_hparam(["mamba_rms_norm"], optional=True)) - self.gguf_writer.add_float32("falcon-h1.rope_theta", self.find_hparam(["rope_theta"], optional=True)) + self.gguf_writer.add_float64("falcon-h1.rope_theta", self.find_hparam(["rope_theta"], optional=True)) ###### CONVERSION LOGIC ###### From 50eadc7b336254ac23b66a678620d05012b18326 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Fri, 4 Jul 2025 14:47:31 +0400 Subject: [PATCH 10/98] fixes --- convert_hf_to_gguf.py | 38 +++++++++++++++++----------------- gguf-py/gguf/tensor_mapping.py | 2 +- src/llama-arch.cpp | 2 +- src/llama-model.h | 2 +- src/llama-quant.cpp | 5 +++++ src/llama-vocab.cpp | 2 +- 6 files changed, 28 insertions(+), 23 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 3566821890310..764c04f10a122 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -6632,11 +6632,11 @@ def set_gguf_parameters(self): self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) self.gguf_writer.add_key_length(self.hparams["head_dim"]) self.gguf_writer.add_value_length(self.hparams["head_dim"]) - self.gguf_writer.add_float32("falcon-h1.key_multiplier", self.hparams["key_multiplier"]) + self.gguf_writer.add_float32("falcon_h1.key_multiplier", self.hparams["key_multiplier"]) ## Other params - self.gguf_writer.add_float32("falcon-h1.lm_head_multiplier", self.hparams["lm_head_multiplier"]) - self.gguf_writer.add_float32("falcon-h1.embedding_multiplier", self.hparams["embedding_multiplier"]) + self.gguf_writer.add_float32("falcon_h1.lm_head_multiplier", self.hparams["lm_head_multiplier"]) + self.gguf_writer.add_float32("falcon_h1.embedding_multiplier", self.hparams["embedding_multiplier"]) ## Validation ## assert self.hparams.get("hidden_act") in [None, "silu"], "Only SILU activation supported" @@ -6644,34 +6644,34 @@ def set_gguf_parameters(self): # Add Falcon Mamba2 specific configuration - self.gguf_writer.add_uint32("falcon-h1.ssm.mamba_chunk_size", self.hparams["mamba_chunk_size"]) - self.gguf_writer.add_uint32("falcon-h1.attention.head_dim", self.hparams["head_dim"]) - self.gguf_writer.add_uint32("falcon-h1.ssm.mamba_d_ssm", self.hparams["mamba_d_ssm"]) - self.gguf_writer.add_uint32("falcon-h1.num_attention_heads", self.find_hparam(["num_attention_heads"])) - self.gguf_writer.add_uint32("falcon-h1.num_key_value_heads", + self.gguf_writer.add_uint32("falcon_h1.ssm.mamba_chunk_size", self.hparams["mamba_chunk_size"]) + 
self.gguf_writer.add_uint32("falcon_h1.attention.head_dim", self.hparams["head_dim"]) + self.gguf_writer.add_uint32("falcon_h1.ssm.mamba_d_ssm", self.hparams["mamba_d_ssm"]) + self.gguf_writer.add_uint32("falcon_h1.num_attention_heads", self.find_hparam(["num_attention_heads"])) + self.gguf_writer.add_uint32("falcon_h1.num_key_value_heads", self.find_hparam(["num_key_value_heads"], optional=True) or self.find_hparam(["num_attention_heads"])) # Add multipliers as metadata instead of tensors - self.gguf_writer.add_float32("falcon-h1.attention_in_multiplier", self.attention_in_multiplier) - self.gguf_writer.add_float32("falcon-h1.attention_out_multiplier", self.attention_out_multiplier) - self.gguf_writer.add_float32("falcon-h1.ssm_in_multiplier", self.ssm_in_multiplier) - self.gguf_writer.add_float32("falcon-h1.ssm_out_multiplier", self.ssm_out_multiplier) + self.gguf_writer.add_float32("falcon_h1.attention_in_multiplier", self.attention_in_multiplier) + self.gguf_writer.add_float32("falcon_h1.attention_out_multiplier", self.attention_out_multiplier) + self.gguf_writer.add_float32("falcon_h1.ssm_in_multiplier", self.ssm_in_multiplier) + self.gguf_writer.add_float32("falcon_h1.ssm_out_multiplier", self.ssm_out_multiplier) # Add MLP multipliers if isinstance(self.mlp_multipliers, (list, tuple)) and len(self.mlp_multipliers) == 2: - self.gguf_writer.add_float32("falcon-h1.mlp_gate_multiplier", self.mlp_multipliers[0]) - self.gguf_writer.add_float32("falcon-h1.mlp_down_multiplier", self.mlp_multipliers[1]) + self.gguf_writer.add_float32("falcon_h1.mlp_gate_multiplier", self.mlp_multipliers[0]) + self.gguf_writer.add_float32("falcon_h1.mlp_down_multiplier", self.mlp_multipliers[1]) # Add has MuP flag if SSM multipliers are present if self.ssm_multipliers is not None: - self.gguf_writer.add_bool("falcon-h1.ssm.has_mup", True) + self.gguf_writer.add_bool("falcon_h1.ssm.has_mup", True) # Add any other Falcon Mamba2 specific configuration - self.gguf_writer.add_bool("falcon-h1.mamba_use_mlp", self.find_hparam(["mamba_use_mlp"], optional=True)) - self.gguf_writer.add_bool("falcon-h1.mamba_norm_before_gate", self.find_hparam(["mamba_norm_before_gate"], optional=True)) - self.gguf_writer.add_bool("falcon-h1.mamba_rms_norm", self.find_hparam(["mamba_rms_norm"], optional=True)) - self.gguf_writer.add_float32("falcon-h1.rope_theta", self.find_hparam(["rope_theta"], optional=True)) + self.gguf_writer.add_bool("falcon_h1.mamba_use_mlp", self.find_hparam(["mamba_use_mlp"], optional=True)) + self.gguf_writer.add_bool("falcon_h1.mamba_norm_before_gate", self.find_hparam(["mamba_norm_before_gate"], optional=True)) + self.gguf_writer.add_bool("falcon_h1.mamba_rms_norm", self.find_hparam(["mamba_rms_norm"], optional=True)) + self.gguf_writer.add_float32("falcon_h1.rope_theta", self.find_hparam(["rope_theta"], optional=True)) ###### CONVERSION LOGIC ###### diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index be3fff848b2e4..1ce339bd18ee2 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -1177,7 +1177,7 @@ class TensorNameMap: ), MODEL_TENSOR.SSM_MUP_VEC: ( - "model.layers.{bid}.mamba.mup_vector", # falcon-h1 + "model.layers.{bid}.mamba.mup_vector", # falcon_h1 ), MODEL_TENSOR.SSM_NORM: ( diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index 635a5bdbc9b65..4b555754eeee1 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -46,7 +46,7 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_STARCODER2, "starcoder2" }, { LLM_ARCH_MAMBA, "mamba" 
}, { LLM_ARCH_MAMBA2, "mamba2" }, - { LLM_ARCH_FALCON_H1, "falcon-h1" }, + { LLM_ARCH_FALCON_H1, "falcon_h1" }, { LLM_ARCH_XVERSE, "xverse" }, { LLM_ARCH_COMMAND_R, "command-r" }, { LLM_ARCH_COHERE2, "cohere2" }, diff --git a/src/llama-model.h b/src/llama-model.h index fc235cd23d4c0..8e14be82b52a0 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -226,7 +226,7 @@ struct llama_layer { struct ggml_tensor * ffn_down_enc = nullptr; struct ggml_tensor * ffn_up_enc = nullptr; - // falcon-h1 + // falcon_h1 struct ggml_tensor * ssm_in_b = nullptr; struct ggml_tensor * ssm_mup_vec = nullptr; diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index f4b5713d7dd9a..2039ed5234f5d 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -315,6 +315,11 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) { new_type = GGML_TYPE_IQ2_S; } + } else if (name.find("ssm_in.weight") != std::string::npos) { + // For mamba-based models it's better to not quantize the ssm-proj layers + if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) { + new_type = GGML_TYPE_BF16; + } } else if (name.find("attn_q.weight") != std::string::npos) { if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) { new_type = GGML_TYPE_IQ3_XXS; diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp index d492515609ca3..7d2a41301ff5e 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -1522,7 +1522,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { tokenizer_pre == "llama-v3" || tokenizer_pre == "llama-bpe"|| tokenizer_pre == "falcon3" || - tokenizer_pre == "falcon-h1" || + tokenizer_pre == "falcon_h1" || tokenizer_pre == "pixtral") { pre_type = LLAMA_VOCAB_PRE_TYPE_LLAMA3; ignore_merges = true; From 1415cd87823ab39244d0ccd19df235d2557e563b Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Fri, 4 Jul 2025 14:49:59 +0400 Subject: [PATCH 11/98] another fix --- convert_hf_to_gguf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index a3df832cd5587..857725e2663bf 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -688,7 +688,7 @@ def get_vocab_base_pre(self, tokenizer) -> str: res = "falcon3" if chkhsh == "60476e1243776c4fb1b993dbd7a5f15ac22f83c80afdf425fa5ae01c8d44ef86": # ref: https://huggingface.co/collections/tiiuae/falcon-h1-6819f2795bc406da60fab8df - res = "falcon-H1" + res = "falcon_h1" if chkhsh == "8e62295832751ca1e8f92f2226f403dea30dc5165e448b5bfa05af5340c64ec7": # ref: https://huggingface.co/BAAI/bge-large-zh-v1.5 res = "bert-bge-large" From 243e4d1a50bd73467d99f6b289b9a1826f83b94b Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Fri, 4 Jul 2025 14:55:31 +0400 Subject: [PATCH 12/98] fix --- convert_hf_to_gguf.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 857725e2663bf..66cbd6741f4df 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -6606,6 +6606,8 @@ def get_tensors(self) -> Iterator[tuple[str, Tensor]]: for name, tensor in super().get_tensors(): if name.startswith("model.backbone") or name.startswith("model.lm_head"): name = name.removeprefix("model.") + if "ffn_pre_norm" in name: + name = name.replace("ffn_pre_norm", "ffn_norm") yield name, tensor if self.ssm_multipliers is not None: From cce35498d53eaf8b6624d65827e1d34d21f7005f Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Fri, 4 Jul 2025 14:58:33 +0400 Subject: [PATCH 13/98] pre-norm -> norm --- src/llama-arch.cpp | 
2 +- src/llama-model.cpp | 4 ++-- src/llama-model.h | 1 - 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index 4b555754eeee1..469c9b51e875e 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -1068,7 +1068,7 @@ static const std::map> LLM_TENSOR_N { LLM_TENSOR_SSM_D, "blk.%d.ssm_d" }, { LLM_TENSOR_SSM_NORM, "blk.%d.ssm_norm" }, { LLM_TENSOR_SSM_OUT, "blk.%d.ssm_out" }, - { LLM_TENSOR_FFN_PRE_NORM, "blk.%d.ffn_pre_norm" }, + { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_pre_norm" }, { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, diff --git a/src/llama-model.cpp b/src/llama-model.cpp index fc822331aaea2..e632fa7770c1f 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -4595,7 +4595,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) { // feed forward (w/ optional biases) - layer.ffn_pre_norm = create_tensor(tn(LLM_TENSOR_FFN_PRE_NORM, i), {hidden_size}, 0); + layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_PRE_NORM, i), {hidden_size}, 0); layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0)); layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {hidden_size, ffn_intermediate_size}, 0); layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { ffn_intermediate_size, hidden_size}, 0); @@ -14751,7 +14751,7 @@ struct llm_build_falcon_h1 : public llm_graph_context { // feed-forward network cur = build_norm(ffn_inp, - model.layers[il].ffn_pre_norm, NULL, + model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); diff --git a/src/llama-model.h b/src/llama-model.h index 8e14be82b52a0..506fcd47897e5 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -216,7 +216,6 @@ struct llama_layer { struct ggml_tensor * layer_out_norm_b = nullptr; struct ggml_tensor * ffn_norm_exps = nullptr; struct ggml_tensor * ffn_norm_enc = nullptr; - struct ggml_tensor * ffn_pre_norm = nullptr; // ff struct ggml_tensor * ffn_gate = nullptr; // w1 From 22de62cf56a61b78ab771d5a1ddd09c4c847a480 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Fri, 4 Jul 2025 15:02:14 +0400 Subject: [PATCH 14/98] fix --- src/llama-arch.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index 469c9b51e875e..ebae132b7ab47 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -1068,7 +1068,7 @@ static const std::map> LLM_TENSOR_N { LLM_TENSOR_SSM_D, "blk.%d.ssm_d" }, { LLM_TENSOR_SSM_NORM, "blk.%d.ssm_norm" }, { LLM_TENSOR_SSM_OUT, "blk.%d.ssm_out" }, - { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_pre_norm" }, + { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, From 2fe057cc401da1593d871555e07131ad366fd1fa Mon Sep 17 00:00:00 2001 From: ibrahimkhadraoui Date: Fri, 4 Jul 2025 15:04:13 +0400 Subject: [PATCH 15/98] Revert "fix" This reverts commit 243e4d1a50bd73467d99f6b289b9a1826f83b94b. 
--- convert_hf_to_gguf.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 66cbd6741f4df..857725e2663bf 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -6606,8 +6606,6 @@ def get_tensors(self) -> Iterator[tuple[str, Tensor]]: for name, tensor in super().get_tensors(): if name.startswith("model.backbone") or name.startswith("model.lm_head"): name = name.removeprefix("model.") - if "ffn_pre_norm" in name: - name = name.replace("ffn_pre_norm", "ffn_norm") yield name, tensor if self.ssm_multipliers is not None: From 6c7d9e26e7a49b728ec23dfd617614680d419ddf Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Fri, 4 Jul 2025 15:25:59 +0400 Subject: [PATCH 16/98] fix --- src/llama-arch.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index ebae132b7ab47..ae6ea76fff7a3 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -1068,7 +1068,7 @@ static const std::map> LLM_TENSOR_N { LLM_TENSOR_SSM_D, "blk.%d.ssm_d" }, { LLM_TENSOR_SSM_NORM, "blk.%d.ssm_norm" }, { LLM_TENSOR_SSM_OUT, "blk.%d.ssm_out" }, - { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, + { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, From 15138df48f2637da9696772ded9d0c013c169946 Mon Sep 17 00:00:00 2001 From: ibrahimkhadraoui Date: Fri, 4 Jul 2025 15:37:40 +0400 Subject: [PATCH 17/98] small fix ffn_norm --- gguf-py/gguf/tensor_mapping.py | 2 +- src/llama-arch.cpp | 2 +- src/llama-model.cpp | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 65311cee00c00..245385dfdf201 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -286,7 +286,7 @@ class TensorNameMap: # Post feed-forward norm MODEL_TENSOR.FFN_PRE_NORM: ( "model.layers.{bid}.pre_feedforward_layernorm", # gemma2 - "model.layers.{bid}.pre_ff_layernorm.weight", + "model.layers.{bid}.pre_ff_layernorm.weight", ), # Post feed-forward norm diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index ebae132b7ab47..ae6ea76fff7a3 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -1068,7 +1068,7 @@ static const std::map> LLM_TENSOR_N { LLM_TENSOR_SSM_D, "blk.%d.ssm_d" }, { LLM_TENSOR_SSM_NORM, "blk.%d.ssm_norm" }, { LLM_TENSOR_SSM_OUT, "blk.%d.ssm_out" }, - { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, + { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, diff --git a/src/llama-model.cpp b/src/llama-model.cpp index e632fa7770c1f..93e7dae59fe1a 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -4595,7 +4595,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) { // feed forward (w/ optional biases) - layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_PRE_NORM, i), {hidden_size}, 0); + layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, i), {hidden_size}, 0); layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? 
llama_model_loader::TENSOR_DUPLICATED : 0)); layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {hidden_size, ffn_intermediate_size}, 0); layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { ffn_intermediate_size, hidden_size}, 0); From 1fd0574adc4a0184951c86eb1caa52d69ac62188 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Fri, 4 Jul 2025 15:50:43 +0400 Subject: [PATCH 18/98] try --- convert_hf_to_gguf.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 857725e2663bf..c43633053855f 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -607,7 +607,10 @@ def get_vocab_base(self) -> tuple[list[str], list[int], str]: from transformers import AutoTokenizer tokenizer = AutoTokenizer.from_pretrained(self.dir_model) - vocab_size = self.hparams.get("vocab_size", len(tokenizer.vocab)) + vocab_size = max( + self.hparams.get("vocab_size", len(tokenizer.vocab)), + len(tokenizer.vocab) + ) assert max(tokenizer.vocab.values()) < vocab_size tokpre = self.get_vocab_base_pre(tokenizer) From 250b4f1074da288399e71577a925aa83acf316d5 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Fri, 4 Jul 2025 15:53:47 +0400 Subject: [PATCH 19/98] mix instead of max --- convert_hf_to_gguf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index c43633053855f..ef5209927f6a2 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -607,7 +607,7 @@ def get_vocab_base(self) -> tuple[list[str], list[int], str]: from transformers import AutoTokenizer tokenizer = AutoTokenizer.from_pretrained(self.dir_model) - vocab_size = max( + vocab_size = min( self.hparams.get("vocab_size", len(tokenizer.vocab)), len(tokenizer.vocab) ) From 3ee798396185e53e1cbcdbacbe2004494ed6dcab Mon Sep 17 00:00:00 2001 From: ibrahimkhadraoui Date: Fri, 4 Jul 2025 16:25:27 +0400 Subject: [PATCH 20/98] fix vocab size --- convert_hf_to_gguf.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index c43633053855f..108f12be22eb0 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -607,10 +607,7 @@ def get_vocab_base(self) -> tuple[list[str], list[int], str]: from transformers import AutoTokenizer tokenizer = AutoTokenizer.from_pretrained(self.dir_model) - vocab_size = max( - self.hparams.get("vocab_size", len(tokenizer.vocab)), - len(tokenizer.vocab) - ) + vocab_size = self.hparams.get("vocab_size", len(tokenizer.vocab)) assert max(tokenizer.vocab.values()) < vocab_size tokpre = self.get_vocab_base_pre(tokenizer) @@ -4885,6 +4882,9 @@ def set_vocab(self): pad_vocab = self.hparams.get("pad_vocab_size_multiple", 16) # pad using ceiling division # ref: https://stackoverflow.com/a/17511341/22827863 + # if architecture is FalconH1, don't pad vocab size + if self.hparams.get("architectures", [None])[0] == "FalconH1ForCausalLM": + pad_vocab = 1 vocab_size = -(vocab_size // -pad_vocab) * pad_vocab self.hparams["vocab_size"] = vocab_size From 9760c8bc9dfca6b4435657220c5d37a44cf63cc7 Mon Sep 17 00:00:00 2001 From: ibrahimkhadraoui Date: Fri, 4 Jul 2025 16:28:48 +0400 Subject: [PATCH 21/98] conflict solve --- convert_hf_to_gguf.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 4aabfd01d0d97..108f12be22eb0 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -607,14 +607,7 @@ def get_vocab_base(self) -> tuple[list[str], 
list[int], str]: from transformers import AutoTokenizer tokenizer = AutoTokenizer.from_pretrained(self.dir_model) -<<<<<<< HEAD vocab_size = self.hparams.get("vocab_size", len(tokenizer.vocab)) -======= - vocab_size = min( - self.hparams.get("vocab_size", len(tokenizer.vocab)), - len(tokenizer.vocab) - ) ->>>>>>> 250b4f1074da288399e71577a925aa83acf316d5 assert max(tokenizer.vocab.values()) < vocab_size tokpre = self.get_vocab_base_pre(tokenizer) From 7a25441e131e41810c3efd209d2f87c3fba7cec5 Mon Sep 17 00:00:00 2001 From: ibrahimkhadraoui Date: Fri, 4 Jul 2025 17:41:03 +0400 Subject: [PATCH 22/98] fixed multipliers --- convert_hf_to_gguf.py | 20 ++++++++++---------- src/llama-hparams.h | 20 ++++++++++---------- src/llama-model-loader.cpp | 1 + 3 files changed, 21 insertions(+), 20 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 108f12be22eb0..17feb9430c8a3 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -6645,11 +6645,11 @@ def set_gguf_parameters(self): self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) self.gguf_writer.add_key_length(self.hparams["head_dim"]) self.gguf_writer.add_value_length(self.hparams["head_dim"]) - self.gguf_writer.add_float32("falcon_h1.key_multiplier", self.hparams["key_multiplier"]) + self.gguf_writer.add_float64("falcon_h1.key_multiplier", self.hparams["key_multiplier"]) ## Other params - self.gguf_writer.add_float32("falcon_h1.lm_head_multiplier", self.hparams["lm_head_multiplier"]) - self.gguf_writer.add_float32("falcon_h1.embedding_multiplier", self.hparams["embedding_multiplier"]) + self.gguf_writer.add_float64("falcon_h1.lm_head_multiplier", self.hparams["lm_head_multiplier"]) + self.gguf_writer.add_float64("falcon_h1.embedding_multiplier", self.hparams["embedding_multiplier"]) ## Validation ## assert self.hparams.get("hidden_act") in [None, "silu"], "Only SILU activation supported" @@ -6666,15 +6666,15 @@ def set_gguf_parameters(self): self.find_hparam(["num_attention_heads"])) # Add multipliers as metadata instead of tensors - self.gguf_writer.add_float32("falcon_h1.attention_in_multiplier", self.attention_in_multiplier) - self.gguf_writer.add_float32("falcon_h1.attention_out_multiplier", self.attention_out_multiplier) - self.gguf_writer.add_float32("falcon_h1.ssm_in_multiplier", self.ssm_in_multiplier) - self.gguf_writer.add_float32("falcon_h1.ssm_out_multiplier", self.ssm_out_multiplier) + self.gguf_writer.add_float64("falcon_h1.attention_in_multiplier", self.attention_in_multiplier) + self.gguf_writer.add_float64("falcon_h1.attention_out_multiplier", self.attention_out_multiplier) + self.gguf_writer.add_float64("falcon_h1.ssm_in_multiplier", self.ssm_in_multiplier) + self.gguf_writer.add_float64("falcon_h1.ssm_out_multiplier", self.ssm_out_multiplier) # Add MLP multipliers if isinstance(self.mlp_multipliers, (list, tuple)) and len(self.mlp_multipliers) == 2: - self.gguf_writer.add_float32("falcon_h1.mlp_gate_multiplier", self.mlp_multipliers[0]) - self.gguf_writer.add_float32("falcon_h1.mlp_down_multiplier", self.mlp_multipliers[1]) + self.gguf_writer.add_float64("falcon_h1.mlp_gate_multiplier", self.mlp_multipliers[0]) + self.gguf_writer.add_float64("falcon_h1.mlp_down_multiplier", self.mlp_multipliers[1]) # Add has MuP flag if SSM multipliers are present if self.ssm_multipliers is not None: @@ -6684,7 +6684,7 @@ def set_gguf_parameters(self): self.gguf_writer.add_bool("falcon_h1.mamba_use_mlp", self.find_hparam(["mamba_use_mlp"], optional=True)) 
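(Side note on the float32 -> float64 switch in this patch: it pairs with the get_key instantiation for double added further down in the same patch in llama-model-loader.cpp. A minimal, standalone illustration of the precision difference, using a made-up multiplier value; nothing here is taken from a real Falcon-H1 config.)

import numpy as np

m = 0.0123456789012345              # hypothetical multiplier with more digits than float32 keeps
print(float(np.float32(m)) - m)     # non-zero: float32 rounds the value before it is stored
print(float(np.float64(m)) - m)     # 0.0: float64 round-trips the Python float exactly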
self.gguf_writer.add_bool("falcon_h1.mamba_norm_before_gate", self.find_hparam(["mamba_norm_before_gate"], optional=True)) self.gguf_writer.add_bool("falcon_h1.mamba_rms_norm", self.find_hparam(["mamba_rms_norm"], optional=True)) - self.gguf_writer.add_float32("falcon_h1.rope_theta", self.find_hparam(["rope_theta"], optional=True)) + self.gguf_writer.add_float64("falcon_h1.rope_theta", self.find_hparam(["rope_theta"], optional=True)) ###### CONVERSION LOGIC ###### diff --git a/src/llama-hparams.h b/src/llama-hparams.h index d671edaa4d04d..2142a74aafd20 100644 --- a/src/llama-hparams.h +++ b/src/llama-hparams.h @@ -122,17 +122,17 @@ struct llama_hparams { bool mamba_use_mlp = false; bool mamba_norm_before_gate = false; bool mamba_rms_norm = false; - float attention_in_multiplier = 1.0f; - float attention_out_multiplier = 1.0f; - float ssm_in_multiplier = 1.0f; - float ssm_out_multiplier = 1.0f; - float mlp_gate_multiplier = 1.0f; - float mlp_down_multiplier = 1.0f; - float key_multiplier = 1.0f; - float lm_head_multiplier = 1.0f; - float rope_theta = 10000.0f; + double attention_in_multiplier = 1.0; + double attention_out_multiplier = 1.0; + double ssm_in_multiplier = 1.0; + double ssm_out_multiplier = 1.0; + double mlp_gate_multiplier = 1.0; + double mlp_down_multiplier = 1.0; + double key_multiplier = 1.0; + double lm_head_multiplier = 1.0; + double rope_theta = 10000.0; + double embedding_multiplier = 1.0; bool ssm_has_mup = false; - float embedding_multiplier = 1.0f; uint32_t vocab_size = 0; uint32_t intermediate_size = 0; float mamba_expand = 0.0f; diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp index bd9e6da8832b7..43079c32a6388 100644 --- a/src/llama-model-loader.cpp +++ b/src/llama-model-loader.cpp @@ -400,6 +400,7 @@ namespace GGUFMeta { template bool llama_model_loader::get_key (enum llm_kv kid, bool & result, bool required); template bool llama_model_loader::get_key (enum llm_kv kid, float & result, bool required); + template bool llama_model_loader::get_key (enum llm_kv kid, double & result, bool required); template bool llama_model_loader::get_key (enum llm_kv kid, uint32_t & result, bool required); template bool llama_model_loader::get_key(enum llm_kv kid, std::string & result, bool required); From 280dd2dcb7434ad084578b60457854fd768ebf5f Mon Sep 17 00:00:00 2001 From: ibrahimkhadraoui Date: Mon, 7 Jul 2025 10:25:57 +0400 Subject: [PATCH 23/98] falcon-h1 specefic vocab resolved --- convert_hf_to_gguf.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 17feb9430c8a3..96cc8d28c7925 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -4882,9 +4882,6 @@ def set_vocab(self): pad_vocab = self.hparams.get("pad_vocab_size_multiple", 16) # pad using ceiling division # ref: https://stackoverflow.com/a/17511341/22827863 - # if architecture is FalconH1, don't pad vocab size - if self.hparams.get("architectures", [None])[0] == "FalconH1ForCausalLM": - pad_vocab = 1 vocab_size = -(vocab_size // -pad_vocab) * pad_vocab self.hparams["vocab_size"] = vocab_size @@ -6590,6 +6587,9 @@ def find_hparam(self, keys: Iterable[str], *args, **kwargs) -> Any: keys = list(keys) + prefixed return super().find_hparam(keys, *args, **kwargs) + def set_vocab(self): + self._set_vocab_gpt2() + def _generate_mup_vector(self, block_id: int) -> torch.Tensor: zxbcdt_multipliers = self.hparams["ssm_multipliers"] intermediate_size = self.hparams["mamba_d_ssm"] From c56ec07a9ac2bebdf868e8ce052299b3ec59ca4b Mon 
Sep 17 00:00:00 2001 From: ibrahimkhadraoui Date: Mon, 7 Jul 2025 10:34:46 +0400 Subject: [PATCH 24/98] read arch from gguf.MODEL_ARCH --- convert_hf_to_gguf.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 96cc8d28c7925..89bfc33c8ce0a 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -4909,8 +4909,7 @@ def set_gguf_parameters(self): # Fail early for models which don't have a block expansion factor of 2 # TODO: does this really matter? # skip the assertion for FalconH1 Model - architectures = self.hparams.get("architectures") - if architectures is None or architectures[0] != "FalconH1ForCausalLM": + if self.model_arch != gguf.MODEL_ARCH.FALCON_H1: assert d_inner == 2 * d_model assert d_inner % head_dim == 0 From c4af0f3ca531bc6e295efae66aea5a8b6ae149de Mon Sep 17 00:00:00 2001 From: ibrahimkhadraoui Date: Mon, 7 Jul 2025 11:17:31 +0400 Subject: [PATCH 25/98] mamba_d_ssm added to d_inner find_hparam --- convert_hf_to_gguf.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 89bfc33c8ce0a..316d9b59d3a12 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -4899,7 +4899,7 @@ def set_vocab(self): def set_gguf_parameters(self): d_model = self.find_hparam(["hidden_size", "d_model", "dim"]) d_conv = self.find_hparam(["conv_kernel", "d_conv"], optional=True) or 4 - d_inner = self.find_hparam(["intermediate_size", "d_inner"], optional=True) or 2 * d_model + d_inner = self.find_hparam(["mamba_d_ssm", "intermediate_size", "d_inner"], optional=True) or 2 * d_model d_state = self.find_hparam(["state_size", "d_state"], optional=True) or 128 head_dim = self.find_hparam(["head_dim"], optional=True) or 64 n_group = self.find_hparam(["n_groups"], optional=True) or 1 @@ -4948,12 +4948,8 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter data_torch = data_torch.reshape((*data_torch.shape, 1)) elif self.match_model_tensor_name(new_name, gguf.MODEL_TENSOR.SSM_NORM, bid): d_model = self.find_hparam(["hidden_size", "d_model", "dim"]) - d_inner = self.find_hparam(["intermediate_size", "d_inner"], optional=True) or 2 * d_model + d_inner = self.find_hparam(["mamba_d_ssm", "intermediate_size", "d_inner"], optional=True) or 2 * d_model n_group = self.hparams.get("n_groups", 1) - architectures = self.hparams.get("architectures") - if architectures is not None and architectures[0] == "FalconH1ForCausalLM": - # FalconH1F has a different d_inner - d_inner = self.hparams.get("mamba_d_ssm") data_torch = data_torch.reshape((n_group, d_inner // n_group)) if name.endswith(".A_log"): From 53304c84dbd827c87f4e4566094e6203d40c7d3a Mon Sep 17 00:00:00 2001 From: ibrahimkhadraoui Date: Mon, 7 Jul 2025 11:18:14 +0400 Subject: [PATCH 26/98] remove unused functions from gguf_writer.py --- gguf-py/gguf/gguf_writer.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index 7043fa375956c..b1d9b6f1985a3 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -870,12 +870,6 @@ def add_ssm_dt_b_c_rms(self, value: bool) -> None: def add_ssm_head_dim(self, value: int) -> None: self.add_uint32(Keys.SSM.HEAD_DIM.format(arch=self.arch), value) - def add_attn_head_count(self, count: int) -> None: - self.add_uint32(Keys.Attention.HEAD_COUNT.format(arch=self.arch), count) - - def add_key_value_head_count(self, count: int) -> None: - 
self.add_uint32(Keys.Attention.HEAD_COUNT_KV.format(arch=self.arch), count) - def add_tokenizer_model(self, model: str) -> None: self.add_string(Keys.Tokenizer.MODEL, model) From 441d8d66bd0dfda9375795c26608fc8b8d26e53e Mon Sep 17 00:00:00 2001 From: ibrahimkhadraoui Date: Mon, 7 Jul 2025 12:00:57 +0400 Subject: [PATCH 27/98] override modify_tensors instead of get_tensors --- convert_hf_to_gguf.py | 30 +++++++++++++----------------- 1 file changed, 13 insertions(+), 17 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 316d9b59d3a12..46df230dd3b28 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -6600,23 +6600,19 @@ def _generate_mup_vector(self, block_id: int) -> torch.Tensor: return mup_vector - def get_tensors(self) -> Iterator[tuple[str, Tensor]]: - for name, tensor in super().get_tensors(): - if name.startswith("model.backbone") or name.startswith("model.lm_head"): - name = name.removeprefix("model.") - yield name, tensor - - if self.ssm_multipliers is not None: - # Insert MUP vector after mamba.dt_bias - if "mamba.dt_bias" in name: - block_match = re.search(r"(?:model\.layers\.)?(\d+)\.mamba\.dt_bias", name) - if block_match: - block_id = int(block_match.group(1)) - # Generate MUP vector with correct name format - mup_tensor = self._generate_mup_vector(block_id) - mup_name = f"blk.{block_id}.ssm_mup_vec" - logger.debug(f"Inserting MUP vector for block {block_id}: {mup_name}") - yield mup_name, mup_tensor + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + tensors = list(super().modify_tensors(data_torch, name, bid)) + + if self.ssm_multipliers is not None and "mamba.dt_bias" in name: + block_match = re.search(r"(?:model\.layers\.)?(\d+)\.mamba\.dt_bias", name) + if block_match: + block_id = int(block_match.group(1)) + mup_tensor = self._generate_mup_vector(block_id) + mup_name = f"blk.{block_id}.ssm_mup_vec" + logger.debug(f"Inserting MUP vector for block {block_id}: {mup_name}") + tensors.append((self.map_tensor_name(mup_name), mup_tensor)) + + return tensors def set_gguf_parameters(self): super().set_gguf_parameters() From 6c39e775dd468ce2a845f580656b347efb8f6542 Mon Sep 17 00:00:00 2001 From: Younes B <49240599+younesbelkada@users.noreply.github.com> Date: Mon, 7 Jul 2025 10:56:49 +0200 Subject: [PATCH 28/98] fix conversion and d_inner --- convert_hf_to_gguf.py | 7 ++++++- src/llama-model.cpp | 3 +-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 17feb9430c8a3..ec58457b6b184 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -686,7 +686,12 @@ def get_vocab_base_pre(self, tokenizer) -> str: if chkhsh == "9d032fcbd5501f4a38150912590928bfb36091efb5df11b8e2124b0390e3fb1e": # ref: https://huggingface.co/tiiuae/Falcon3-7B-Base res = "falcon3" - if chkhsh == "60476e1243776c4fb1b993dbd7a5f15ac22f83c80afdf425fa5ae01c8d44ef86": + if ( + chkhsh == "60476e1243776c4fb1b993dbd7a5f15ac22f83c80afdf425fa5ae01c8d44ef86" or + chkhsh == "3eda48b4c4dc7de733d1a8b3e3b4a85243dbbf704da2ee9d42c6beced8897896" or + chkhsh == "48f8e02c0359c0bbdd82f26909171fac1c18a457bb47573ed1fe3bbb2c1cfd4b" or + chkhsh == "a6b57017d60e6edb4d88ecc2845188e0eb333a70357e45dcc9b53964a73bbae6" + ): # ref: https://huggingface.co/collections/tiiuae/falcon-h1-6819f2795bc406da60fab8df res = "falcon_h1" if chkhsh == "8e62295832751ca1e8f92f2226f403dea30dc5165e448b5bfa05af5340c64ec7": diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 
93e7dae59fe1a..07ca68ea0053f 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -14803,10 +14803,9 @@ struct llm_build_falcon_h1 : public llm_graph_context { const int64_t d_conv = hparams.ssm_d_conv; const int64_t d_ssm = hparams.ssm_mamba_d_ssm; - const int64_t d_inner = hparams.ssm_d_inner; const int64_t d_state = hparams.ssm_d_state; const int64_t n_head = hparams.ssm_dt_rank; - const int64_t head_dim = hparams.ssm_head_dim == 0 ? d_inner / n_head : hparams.ssm_head_dim; + const int64_t head_dim = hparams.ssm_head_dim == 0 ? d_ssm / n_head : hparams.ssm_head_dim; const int64_t n_group = hparams.ssm_n_group; const int64_t n_seqs = ubatch.n_seqs; From 8c50893820a6afb1b5a01f61ca0864a08fc59d5e Mon Sep 17 00:00:00 2001 From: ibrahimkhadraoui Date: Mon, 7 Jul 2025 14:10:45 +0400 Subject: [PATCH 29/98] added some cb functions for debugging puposes --- src/llama-model.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 93e7dae59fe1a..2dd828171f8ce 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -14829,12 +14829,14 @@ struct llm_build_falcon_h1 : public llm_graph_context { // {n_embd, d_in_proj} @ {n_embd, n_seq_tokens, n_seqs} => {d_in_proj, n_seq_tokens, n_seqs} ggml_tensor * zxBCdt = build_lora_mm(model.layers[il].ssm_in, cur); + cb(zxBCdt, "zxBCdt", il); // check if the models has ssm_multipliers (MuP) if (hparams.ssm_has_mup) { struct ggml_tensor * mup_vec = model.layers[il].ssm_mup_vec; cur = ggml_mul(ctx0, zxBCdt, mup_vec); + cb(cur, "ssm_mup", il); zxBCdt = cur; } From 49d74209640b2f75b69b912613212bdf0cd01474 Mon Sep 17 00:00:00 2001 From: ibrahimkhadraoui Date: Mon, 7 Jul 2025 14:18:48 +0400 Subject: [PATCH 30/98] inp_out_ids moved outside of layers loop --- src/llama-model.cpp | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 2dd828171f8ce..84188d16cf936 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -14676,6 +14676,8 @@ struct llm_build_falcon_h1 : public llm_graph_context { const float kq_scale = hparams.f_attention_scale == 0.0f ? 
1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale; + ggml_tensor * inp_out_ids = build_inp_out_ids(); + for (int il = 0; il < n_layer; ++il) { ggml_tensor * inpSA = inpL; @@ -14740,8 +14742,6 @@ struct llm_build_falcon_h1 : public llm_graph_context { cb(cur, "layer_out", il); if (il == n_layer - 1) { - // skip computing output for unused tokens - ggml_tensor * inp_out_ids = build_inp_out_ids(); cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } @@ -14831,7 +14831,6 @@ struct llm_build_falcon_h1 : public llm_graph_context { ggml_tensor * zxBCdt = build_lora_mm(model.layers[il].ssm_in, cur); cb(zxBCdt, "zxBCdt", il); - // check if the models has ssm_multipliers (MuP) if (hparams.ssm_has_mup) { struct ggml_tensor * mup_vec = model.layers[il].ssm_mup_vec; @@ -14850,7 +14849,6 @@ struct llm_build_falcon_h1 : public llm_graph_context { // => {d_conv - 1 + n_seq_tokens, d_inner + 2*n_group*d_state, n_seqs} ggml_tensor * conv_x = ggml_concat(ctx0, conv, ggml_transpose(ctx0, xBC), 0); - // copy last (d_conv - 1) columns back into the state cache ggml_tensor * last_conv = ggml_view_3d(ctx0, conv_x, d_conv - 1, d_ssm + 2*n_group*d_state, n_seqs, conv_x->nb[1], conv_x->nb[2], n_seq_tokens*(conv_x->nb[0])); @@ -14889,7 +14887,6 @@ struct llm_build_falcon_h1 : public llm_graph_context { // {n_head, n_seq_tokens, n_seqs} dt = ggml_add(ctx0, ggml_cont(ctx0, dt), model.layers[il].ssm_dt_b); - ggml_tensor * A = model.layers[il].ssm_a; // use the states and the indices provided by build_rs From 97011d7a1fbb1fdd3cee46c1eaa4bfb3d70ba6ad Mon Sep 17 00:00:00 2001 From: ibrahimkhadraoui Date: Mon, 7 Jul 2025 14:25:32 +0400 Subject: [PATCH 31/98] mup_vec create as float64 --- convert_hf_to_gguf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 46df230dd3b28..5ef225d247395 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -6591,7 +6591,7 @@ def _generate_mup_vector(self, block_id: int) -> torch.Tensor: groups_time_state_size = self.hparams["mamba_n_groups"] * self.hparams["mamba_d_state"] vector_shape = (2 * intermediate_size + 2 * groups_time_state_size + self.hparams["mamba_n_heads"]) - mup_vector = torch.ones(1, 1, vector_shape) + mup_vector = torch.ones(1, 1, vector_shape, dtype=torch.float64) mup_vector[:, :, :intermediate_size] *= zxbcdt_multipliers[0] mup_vector[:, :, intermediate_size:2 * intermediate_size] *= zxbcdt_multipliers[1] mup_vector[:, :, 2 * intermediate_size:2 * intermediate_size + groups_time_state_size] *= zxbcdt_multipliers[2] From 286e1fa56923a387a2015d9f31825c67c75c16fd Mon Sep 17 00:00:00 2001 From: ibrahimkhadraoui Date: Mon, 7 Jul 2025 14:36:51 +0400 Subject: [PATCH 32/98] fix rope_theta --- convert_hf_to_gguf.py | 2 +- src/llama-arch.cpp | 1 - src/llama-arch.h | 1 - src/llama-model.cpp | 1 - 4 files changed, 1 insertion(+), 4 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 5ef225d247395..65017ab705c97 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -6675,7 +6675,7 @@ def set_gguf_parameters(self): self.gguf_writer.add_bool("falcon_h1.mamba_use_mlp", self.find_hparam(["mamba_use_mlp"], optional=True)) self.gguf_writer.add_bool("falcon_h1.mamba_norm_before_gate", self.find_hparam(["mamba_norm_before_gate"], optional=True)) self.gguf_writer.add_bool("falcon_h1.mamba_rms_norm", self.find_hparam(["mamba_rms_norm"], optional=True)) - self.gguf_writer.add_float64("falcon_h1.rope_theta", 
self.find_hparam(["rope_theta"], optional=True)) + self.gguf_writer.add_rope_freq_base(self.find_hparam(["rope_theta"])) ###### CONVERSION LOGIC ###### diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index ae6ea76fff7a3..7fb81cfdc82aa 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -237,7 +237,6 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_FALCON_H1_SSM_HAS_MUP, "%s.ssm.has_mup" }, { LLM_KV_FALCON_H1_MAMBA_NORM_BEFORE_GATE, "%s.mamba_norm_before_gate" }, { LLM_KV_FALCON_H1_MAMBA_RMS_NORM, "%s.mamba_rms_norm" }, - { LLM_KV_FALCON_H1_ROPE_THETA, "%s.rope_theta" }, { LLM_KV_FALCON_H1_KEY_MULTIPLIER, "%s.key_multiplier" }, { LLM_KV_FALCON_H1_LM_HEAD_MULTIPLIER, "%s.lm_head_multiplier" }, { LLM_KV_FALCON_H1_EMBEDDING_MULTIPLIER, "%s.embedding_multiplier" }, diff --git a/src/llama-arch.h b/src/llama-arch.h index c05cb851978bd..4ad1beb2455ef 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -172,7 +172,6 @@ enum llm_kv { LLM_KV_FALCON_H1_SSM_HAS_MUP, LLM_KV_FALCON_H1_MAMBA_NORM_BEFORE_GATE, LLM_KV_FALCON_H1_MAMBA_RMS_NORM, - LLM_KV_FALCON_H1_ROPE_THETA, LLM_KV_FALCON_H1_KEY_MULTIPLIER, LLM_KV_FALCON_H1_LM_HEAD_MULTIPLIER, LLM_KV_FALCON_H1_EMBEDDING_MULTIPLIER, diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 84188d16cf936..fe7ba4f9ac5a4 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -1577,7 +1577,6 @@ void llama_model::load_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_FALCON_H1_SSM_HAS_MUP, hparams.ssm_has_mup); ml.get_key(LLM_KV_FALCON_H1_MAMBA_NORM_BEFORE_GATE, hparams.mamba_norm_before_gate); ml.get_key(LLM_KV_FALCON_H1_MAMBA_RMS_NORM, hparams.mamba_rms_norm); - ml.get_key(LLM_KV_FALCON_H1_ROPE_THETA, hparams.rope_theta); ml.get_key(LLM_KV_FALCON_H1_KEY_MULTIPLIER, hparams.key_multiplier); ml.get_key(LLM_KV_FALCON_H1_LM_HEAD_MULTIPLIER, hparams.lm_head_multiplier); ml.get_key(LLM_KV_FALCON_H1_EMBEDDING_MULTIPLIER, hparams.embedding_multiplier); From a9f3a63dc10b05dba5b7f56a820fd91e3c35deca Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Mon, 7 Jul 2025 15:00:25 +0400 Subject: [PATCH 33/98] injected mup --- convert_hf_to_gguf.py | 69 ++++++++++++++++++++++------------ gguf-py/gguf/constants.py | 3 -- gguf-py/gguf/tensor_mapping.py | 6 +-- src/llama-arch.cpp | 12 ------ src/llama-arch.h | 11 ------ src/llama-graph.cpp | 7 ---- src/llama-hparams.h | 10 ----- src/llama-model.cpp | 29 -------------- src/llama-model.h | 1 - 9 files changed, 45 insertions(+), 103 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index bdf7363964895..66073067e7b53 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -6576,6 +6576,7 @@ def __init__(self, *args, **kwargs): self.mlp_multipliers = self.find_hparam(["mlp_multipliers"], optional=True) self.ssm_multipliers = self.find_hparam(["ssm_multipliers"], optional=True) self.intermediate_size = self.find_hparam(["intermediate_size"]) + self.key_multiplier = self.find_hparam(["key_multiplier"], optional=True) def find_hparam(self, keys: Iterable[str], *args, **kwargs) -> Any: prefixed = [] @@ -6607,16 +6608,38 @@ def _generate_mup_vector(self, block_id: int) -> torch.Tensor: def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: tensors = list(super().modify_tensors(data_torch, name, bid)) - - if self.ssm_multipliers is not None and "mamba.dt_bias" in name: - block_match = re.search(r"(?:model\.layers\.)?(\d+)\.mamba\.dt_bias", name) - if block_match: - block_id = int(block_match.group(1)) - mup_tensor = 
self._generate_mup_vector(block_id) - mup_name = f"blk.{block_id}.ssm_mup_vec" - logger.debug(f"Inserting MUP vector for block {block_id}: {mup_name}") - tensors.append((self.map_tensor_name(mup_name), mup_tensor)) - + tensor = tensors[0][1] + + if "down_proj" in name: + tensor = tensor * self.mlp_multipliers[1] + elif "gate_proj" in name: + tensor = tensor * self.mlp_multipliers[0] + elif "k_proj" in name: + tensor = tensor * self.key_multiplier * self.attention_in_multiplier + elif "q_proj" in name: + tensor = tensor * self.attention_in_multiplier + elif "v_proj" in name: + tensor = tensor * self.attention_in_multiplier + elif "o_proj" in name: + tensor = tensor * self.attention_out_multiplier + elif "out_proj" in name: + tensor = tensor * self.ssm_out_multiplier + elif "in_proj" in name: + tensor = tensor * self.ssm_in_multiplier + zxbcdt_multipliers = self.hparams["ssm_multipliers"] + intermediate_size = self.hparams["mamba_d_ssm"] + groups_time_state_size = self.hparams["mamba_n_groups"] * self.hparams["mamba_d_state"] + tensor[:intermediate_size, :] *= zxbcdt_multipliers[0] + tensor[intermediate_size:2 * intermediate_size, :] *= zxbcdt_multipliers[1] + tensor[2 * intermediate_size:2 * intermediate_size + groups_time_state_size, :] *= zxbcdt_multipliers[2] + tensor[2 * intermediate_size + groups_time_state_size:2 * intermediate_size + 2 * groups_time_state_size, :] *= zxbcdt_multipliers[3] + tensor[2 * intermediate_size + 2 * groups_time_state_size:, :] *= zxbcdt_multipliers[4] + elif "lm_head" in name: + tensor = tensor * self.hparams["lm_head_multiplier"] + elif "embed_tokens" in name: + tensor = tensor * self.hparams["embedding_multiplier"] + + tensors = [(tensors[0][0], tensor)] return tensors def set_gguf_parameters(self): @@ -6644,8 +6667,8 @@ def set_gguf_parameters(self): self.gguf_writer.add_float64("falcon_h1.key_multiplier", self.hparams["key_multiplier"]) ## Other params - self.gguf_writer.add_float64("falcon_h1.lm_head_multiplier", self.hparams["lm_head_multiplier"]) - self.gguf_writer.add_float64("falcon_h1.embedding_multiplier", self.hparams["embedding_multiplier"]) + # self.gguf_writer.add_float64("falcon_h1.lm_head_multiplier", self.hparams["lm_head_multiplier"]) + # self.gguf_writer.add_float64("falcon_h1.embedding_multiplier", self.hparams["embedding_multiplier"]) ## Validation ## assert self.hparams.get("hidden_act") in [None, "silu"], "Only SILU activation supported" @@ -6661,20 +6684,16 @@ def set_gguf_parameters(self): self.find_hparam(["num_key_value_heads"], optional=True) or self.find_hparam(["num_attention_heads"])) - # Add multipliers as metadata instead of tensors - self.gguf_writer.add_float64("falcon_h1.attention_in_multiplier", self.attention_in_multiplier) - self.gguf_writer.add_float64("falcon_h1.attention_out_multiplier", self.attention_out_multiplier) - self.gguf_writer.add_float64("falcon_h1.ssm_in_multiplier", self.ssm_in_multiplier) - self.gguf_writer.add_float64("falcon_h1.ssm_out_multiplier", self.ssm_out_multiplier) - - # Add MLP multipliers - if isinstance(self.mlp_multipliers, (list, tuple)) and len(self.mlp_multipliers) == 2: - self.gguf_writer.add_float64("falcon_h1.mlp_gate_multiplier", self.mlp_multipliers[0]) - self.gguf_writer.add_float64("falcon_h1.mlp_down_multiplier", self.mlp_multipliers[1]) + # # Add multipliers as metadata instead of tensors + # self.gguf_writer.add_float64("falcon_h1.attention_in_multiplier", self.attention_in_multiplier) + # self.gguf_writer.add_float64("falcon_h1.attention_out_multiplier", 
self.attention_out_multiplier) + # self.gguf_writer.add_float64("falcon_h1.ssm_in_multiplier", self.ssm_in_multiplier) + # self.gguf_writer.add_float64("falcon_h1.ssm_out_multiplier", self.ssm_out_multiplier) - # Add has MuP flag if SSM multipliers are present - if self.ssm_multipliers is not None: - self.gguf_writer.add_bool("falcon_h1.ssm.has_mup", True) + # # Add MLP multipliers + # if isinstance(self.mlp_multipliers, (list, tuple)) and len(self.mlp_multipliers) == 2: + # self.gguf_writer.add_float64("falcon_h1.mlp_gate_multiplier", self.mlp_multipliers[0]) + # self.gguf_writer.add_float64("falcon_h1.mlp_down_multiplier", self.mlp_multipliers[1]) # Add any other Falcon Mamba2 specific configuration self.gguf_writer.add_bool("falcon_h1.mamba_use_mlp", self.find_hparam(["mamba_use_mlp"], optional=True)) diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 08223927a7cbb..84a91e82ca154 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -527,7 +527,6 @@ class MODEL_TENSOR(IntEnum): POSNET_ATTN_K = auto() POSNET_ATTN_V = auto() POSNET_ATTN_OUT = auto() - SSM_MUP_VEC = auto() # vision V_MMPROJ = auto() V_MMPROJ_FC = auto() @@ -740,7 +739,6 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.SSM_D: "blk.{bid}.ssm_d", MODEL_TENSOR.SSM_NORM: "blk.{bid}.ssm_norm", MODEL_TENSOR.SSM_OUT: "blk.{bid}.ssm_out", - MODEL_TENSOR.SSM_MUP_VEC: "blk.{bid}.ssm_mup_vec", MODEL_TENSOR.TIME_MIX_W0: "blk.{bid}.time_mix_w0", MODEL_TENSOR.TIME_MIX_W1: "blk.{bid}.time_mix_w1", MODEL_TENSOR.TIME_MIX_W2: "blk.{bid}.time_mix_w2", @@ -2230,7 +2228,6 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.ATTN_OUT, # Output projection # SSM components (Mamba2 specific) - MODEL_TENSOR.SSM_MUP_VEC, # Mup vector MODEL_TENSOR.SSM_IN, # Input projection for SSM MODEL_TENSOR.SSM_CONV1D, # Convolution layer MODEL_TENSOR.SSM_DT, # Delta time projection diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 245385dfdf201..ff3f273bd5562 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -1176,11 +1176,7 @@ class TensorNameMap: MODEL_TENSOR.V_RESMPL_ATTN_OUT: ( "resampler.attn.out_proj", ), - - MODEL_TENSOR.SSM_MUP_VEC: ( - "model.layers.{bid}.mamba.mup_vector", # falcon_h1 - ), - + MODEL_TENSOR.SSM_NORM: ( "model.layers.{bid}.mamba.norm", ), diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index 7fb81cfdc82aa..b43911eb30e58 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -228,18 +228,8 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_MAMBA_D_SSM, "%s.ssm.mamba_d_ssm" }, { LLM_KV_FALCON_H1_USE_MLP, "%s.mamba_use_mlp" }, - { LLM_KV_FALCON_H1_ATTENTION_IN_MULTIPLIER, "%s.attention_in_multiplier" }, - { LLM_KV_FALCON_H1_ATTENTION_OUT_MULTIPLIER, "%s.attention_out_multiplier" }, - { LLM_KV_FALCON_H1_SSM_IN_MULTIPLIER, "%s.ssm_in_multiplier" }, - { LLM_KV_FALCON_H1_SSM_OUT_MULTIPLIER, "%s.ssm_out_multiplier" }, - { LLM_KV_FALCON_H1_MLP_GATE_MULTIPLIER, "%s.mlp_gate_multiplier" }, - { LLM_KV_FALCON_H1_MLP_DOWN_MULTIPLIER, "%s.mlp_down_multiplier" }, - { LLM_KV_FALCON_H1_SSM_HAS_MUP, "%s.ssm.has_mup" }, { LLM_KV_FALCON_H1_MAMBA_NORM_BEFORE_GATE, "%s.mamba_norm_before_gate" }, { LLM_KV_FALCON_H1_MAMBA_RMS_NORM, "%s.mamba_rms_norm" }, - { LLM_KV_FALCON_H1_KEY_MULTIPLIER, "%s.key_multiplier" }, - { LLM_KV_FALCON_H1_LM_HEAD_MULTIPLIER, "%s.lm_head_multiplier" }, - { LLM_KV_FALCON_H1_EMBEDDING_MULTIPLIER, "%s.embedding_multiplier" }, { LLM_KV_FALCON_H1_MAMBA_CHUNK_SIZE, "%s.ssm.mamba_chunk_size" }, { LLM_KV_ADAPTER_TYPE, "adapter.type" }, @@ 
-1062,7 +1052,6 @@ static const std::map> LLM_TENSOR_N { LLM_TENSOR_SSM_IN, "blk.%d.ssm_in" }, { LLM_TENSOR_SSM_CONV1D, "blk.%d.ssm_conv1d" }, { LLM_TENSOR_SSM_DT, "blk.%d.ssm_dt" }, - { LLM_TENSOR_SSM_MUP_VEC, "blk.%d.ssm_mup_vec" }, { LLM_TENSOR_SSM_A, "blk.%d.ssm_a" }, { LLM_TENSOR_SSM_D, "blk.%d.ssm_d" }, { LLM_TENSOR_SSM_NORM, "blk.%d.ssm_norm" }, @@ -1832,7 +1821,6 @@ static const std::map LLM_TENSOR_INFOS = { {LLM_TENSOR_SSM_A, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_SSM_SCAN}}, {LLM_TENSOR_SSM_D, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_SSM_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, - {LLM_TENSOR_SSM_MUP_VEC, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_TIME_MIX_LERP_X, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_TIME_MIX_LN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_CHANNEL_MIX_LERP_K, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, diff --git a/src/llama-arch.h b/src/llama-arch.h index 4ad1beb2455ef..80af4223616cc 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -163,18 +163,8 @@ enum llm_kv { LLM_KV_MAMBA_D_SSM, LLM_KV_N_LAYER, LLM_KV_FALCON_H1_USE_MLP, - LLM_KV_FALCON_H1_ATTENTION_IN_MULTIPLIER, - LLM_KV_FALCON_H1_ATTENTION_OUT_MULTIPLIER, - LLM_KV_FALCON_H1_SSM_IN_MULTIPLIER, - LLM_KV_FALCON_H1_SSM_OUT_MULTIPLIER, - LLM_KV_FALCON_H1_MLP_GATE_MULTIPLIER, - LLM_KV_FALCON_H1_MLP_DOWN_MULTIPLIER, - LLM_KV_FALCON_H1_SSM_HAS_MUP, LLM_KV_FALCON_H1_MAMBA_NORM_BEFORE_GATE, LLM_KV_FALCON_H1_MAMBA_RMS_NORM, - LLM_KV_FALCON_H1_KEY_MULTIPLIER, - LLM_KV_FALCON_H1_LM_HEAD_MULTIPLIER, - LLM_KV_FALCON_H1_EMBEDDING_MULTIPLIER, LLM_KV_FALCON_H1_MAMBA_CHUNK_SIZE, LLM_KV_ROPE_DIMENSION_COUNT, @@ -410,7 +400,6 @@ enum llm_tensor { LLM_TENSOR_POS_NET_ATTN_K, LLM_TENSOR_POS_NET_ATTN_V, LLM_TENSOR_POS_NET_ATTN_OUT, - LLM_TENSOR_SSM_MUP_VEC, LLM_TENSOR_FFN_PRE_NORM, LLM_TENSOR_FINAL_NORM, }; diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index eea8207c143cf..4443420132f2b 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -545,10 +545,6 @@ ggml_tensor * llm_graph_context::build_ffn( case LLM_FFN_PAR: { cur = build_lora_mm(gate, cur); - if (arch == LLM_ARCH_FALCON_H1) { - cur = ggml_scale(ctx0, cur, hparams.mlp_gate_multiplier); - } - cb(cur, "ffn_gate", il); } break; } @@ -635,9 +631,6 @@ ggml_tensor * llm_graph_context::build_ffn( // GLM4 seems to have numerical issues with half-precision accumulators ggml_mul_mat_set_prec(cur, GGML_PREC_F32); } - if (arch == LLM_ARCH_FALCON_H1) { - cur = ggml_scale(ctx0, cur, hparams.mlp_down_multiplier); - } } if (down_b) { diff --git a/src/llama-hparams.h b/src/llama-hparams.h index 2142a74aafd20..429eaf0482d0b 100644 --- a/src/llama-hparams.h +++ b/src/llama-hparams.h @@ -122,17 +122,7 @@ struct llama_hparams { bool mamba_use_mlp = false; bool mamba_norm_before_gate = false; bool mamba_rms_norm = false; - double attention_in_multiplier = 1.0; - double attention_out_multiplier = 1.0; - double ssm_in_multiplier = 1.0; - double ssm_out_multiplier = 1.0; - double mlp_gate_multiplier = 1.0; - double mlp_down_multiplier = 1.0; - double key_multiplier = 1.0; - double lm_head_multiplier = 1.0; double rope_theta = 10000.0; - double embedding_multiplier = 1.0; - bool ssm_has_mup = false; uint32_t vocab_size = 0; uint32_t intermediate_size = 0; float mamba_expand = 0.0f; diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 661256df646cb..bf6613a80edf1 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -1568,18 +1568,8 @@ void llama_model::load_hparams(llama_model_loader & ml) 
{ // Falcon-H1 parameters ml.get_key(LLM_KV_ATTN_HEAD_DIM, hparams.attn_head_dim); ml.get_key(LLM_KV_FALCON_H1_USE_MLP, hparams.mamba_use_mlp); - ml.get_key(LLM_KV_FALCON_H1_ATTENTION_IN_MULTIPLIER, hparams.attention_in_multiplier); - ml.get_key(LLM_KV_FALCON_H1_ATTENTION_OUT_MULTIPLIER, hparams.attention_out_multiplier); - ml.get_key(LLM_KV_FALCON_H1_SSM_IN_MULTIPLIER, hparams.ssm_in_multiplier); - ml.get_key(LLM_KV_FALCON_H1_SSM_OUT_MULTIPLIER, hparams.ssm_out_multiplier); - ml.get_key(LLM_KV_FALCON_H1_MLP_GATE_MULTIPLIER, hparams.mlp_gate_multiplier); - ml.get_key(LLM_KV_FALCON_H1_MLP_DOWN_MULTIPLIER, hparams.mlp_down_multiplier); - ml.get_key(LLM_KV_FALCON_H1_SSM_HAS_MUP, hparams.ssm_has_mup); ml.get_key(LLM_KV_FALCON_H1_MAMBA_NORM_BEFORE_GATE, hparams.mamba_norm_before_gate); ml.get_key(LLM_KV_FALCON_H1_MAMBA_RMS_NORM, hparams.mamba_rms_norm); - ml.get_key(LLM_KV_FALCON_H1_KEY_MULTIPLIER, hparams.key_multiplier); - ml.get_key(LLM_KV_FALCON_H1_LM_HEAD_MULTIPLIER, hparams.lm_head_multiplier); - ml.get_key(LLM_KV_FALCON_H1_EMBEDDING_MULTIPLIER, hparams.embedding_multiplier); std::fill(hparams.recurrent_layer_arr.begin(), hparams.recurrent_layer_arr.end(), true); @@ -4570,9 +4560,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) { // no "weight" suffix for these layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {1, ssm_num_heads}, 0); layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {1, ssm_num_heads}, 0); - if (hparams.ssm_has_mup == true) { - layer.ssm_mup_vec = create_tensor(tn(LLM_TENSOR_SSM_MUP_VEC, i), {2*ssm_intermediate_size + 2*ssm_n_groups*ssm_state_size + ssm_num_heads}, 0); - } // ssm_norm if (hparams.mamba_rms_norm == true) { layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), {ssm_intermediate_size / ssm_n_groups, ssm_n_groups}, 0); @@ -14665,7 +14652,6 @@ struct llm_build_falcon_h1 : public llm_graph_context { ggml_tensor * inpL; inpL = build_inp_embd(model.tok_embd); - inpL = ggml_scale(ctx0, inpL, hparams.embedding_multiplier); // inp_pos - contains the positions ggml_tensor * inp_pos = build_inp_pos(); @@ -14684,7 +14670,6 @@ struct llm_build_falcon_h1 : public llm_graph_context { model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il); cb(cur, "attn_norm", il); - cur = ggml_scale(ctx0, cur, hparams.attention_in_multiplier); // self-attention ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); @@ -14699,8 +14684,6 @@ struct llm_build_falcon_h1 : public llm_graph_context { Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); - Kcur = ggml_scale(ctx0, Kcur, hparams.key_multiplier); - Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); Qcur = ggml_rope_ext( @@ -14721,18 +14704,15 @@ struct llm_build_falcon_h1 : public llm_graph_context { ggml_tensor * attn_out = build_attn(inp, gf, model.layers[il].wo, NULL, Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il); - attn_out = ggml_scale(ctx0, attn_out, hparams.attention_out_multiplier); cb(attn_out, "attn_out", il); cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il); // Mamba2 layer - cur = ggml_scale(ctx0, cur, hparams.ssm_in_multiplier); cb(cur, "ssm_in", il); ggml_tensor * ssm_out = build_mamba2_layer(inp, gf, cur, ubatch, il); - ssm_out = ggml_scale(ctx0, ssm_out, hparams.ssm_out_multiplier); cb(ssm_out, "ssm_out", il); // // Aggregation @@ -14782,7 +14762,6 @@ struct llm_build_falcon_h1 : public llm_graph_context { // lm_head cur = build_lora_mm(model.output, 
cur); - cur = ggml_scale(ctx0, cur, hparams.lm_head_multiplier); cb(cur, "result_output", -1); res->t_logits = cur; @@ -14829,14 +14808,6 @@ struct llm_build_falcon_h1 : public llm_graph_context { ggml_tensor * zxBCdt = build_lora_mm(model.layers[il].ssm_in, cur); cb(zxBCdt, "zxBCdt", il); - // check if the models has ssm_multipliers (MuP) - if (hparams.ssm_has_mup) { - struct ggml_tensor * mup_vec = model.layers[il].ssm_mup_vec; - cur = ggml_mul(ctx0, zxBCdt, mup_vec); - cb(cur, "ssm_mup", il); - zxBCdt = cur; - } - // split the above in three ggml_tensor * z = ggml_view_4d(ctx0, zxBCdt, head_dim, n_head, n_seq_tokens, n_seqs, head_dim*zxBCdt->nb[0], zxBCdt->nb[1], zxBCdt->nb[2], 0); ggml_tensor * xBC = ggml_view_3d(ctx0, zxBCdt, d_ssm + 2*n_group*d_state, n_seq_tokens, n_seqs, zxBCdt->nb[1], zxBCdt->nb[2], d_ssm*ggml_element_size(zxBCdt)); diff --git a/src/llama-model.h b/src/llama-model.h index 506fcd47897e5..1f089ebd2e128 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -227,7 +227,6 @@ struct llama_layer { // falcon_h1 struct ggml_tensor * ssm_in_b = nullptr; - struct ggml_tensor * ssm_mup_vec = nullptr; // ff MoE struct ggml_tensor * ffn_gate_inp = nullptr; From e96cc733908c2c9c98fe2f3b105671adeb515ab6 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Mon, 7 Jul 2025 15:13:06 +0400 Subject: [PATCH 34/98] clean ups --- convert_hf_to_gguf.py | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 66073067e7b53..48f52b0e269b2 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -6666,10 +6666,6 @@ def set_gguf_parameters(self): self.gguf_writer.add_value_length(self.hparams["head_dim"]) self.gguf_writer.add_float64("falcon_h1.key_multiplier", self.hparams["key_multiplier"]) - ## Other params - # self.gguf_writer.add_float64("falcon_h1.lm_head_multiplier", self.hparams["lm_head_multiplier"]) - # self.gguf_writer.add_float64("falcon_h1.embedding_multiplier", self.hparams["embedding_multiplier"]) - ## Validation ## assert self.hparams.get("hidden_act") in [None, "silu"], "Only SILU activation supported" assert self.d_inner % d_head == 0, f"SSM inner size {self.d_inner} not a multiple of head dim {d_head}" @@ -6684,16 +6680,6 @@ def set_gguf_parameters(self): self.find_hparam(["num_key_value_heads"], optional=True) or self.find_hparam(["num_attention_heads"])) - # # Add multipliers as metadata instead of tensors - # self.gguf_writer.add_float64("falcon_h1.attention_in_multiplier", self.attention_in_multiplier) - # self.gguf_writer.add_float64("falcon_h1.attention_out_multiplier", self.attention_out_multiplier) - # self.gguf_writer.add_float64("falcon_h1.ssm_in_multiplier", self.ssm_in_multiplier) - # self.gguf_writer.add_float64("falcon_h1.ssm_out_multiplier", self.ssm_out_multiplier) - - # # Add MLP multipliers - # if isinstance(self.mlp_multipliers, (list, tuple)) and len(self.mlp_multipliers) == 2: - # self.gguf_writer.add_float64("falcon_h1.mlp_gate_multiplier", self.mlp_multipliers[0]) - # self.gguf_writer.add_float64("falcon_h1.mlp_down_multiplier", self.mlp_multipliers[1]) # Add any other Falcon Mamba2 specific configuration self.gguf_writer.add_bool("falcon_h1.mamba_use_mlp", self.find_hparam(["mamba_use_mlp"], optional=True)) From 0ad3502839bbc9577e3e1a4d16f833ab5c6df446 Mon Sep 17 00:00:00 2001 From: ibrahimkhadraoui Date: Mon, 7 Jul 2025 15:26:46 +0400 Subject: [PATCH 35/98] rm extra space --- src/llama-arch.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama-arch.cpp 
b/src/llama-arch.cpp index b43911eb30e58..b6b9c40a0e3f7 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -119,7 +119,7 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_LOGIT_SCALE, "%s.logit_scale" }, { LLM_KV_DECODER_START_TOKEN_ID, "%s.decoder_start_token_id" }, { LLM_KV_ATTN_LOGIT_SOFTCAPPING, "%s.attn_logit_softcapping" }, - { LLM_KV_FINAL_LOGIT_SOFTCAPPING, "%s.final_logit_softcapping" }, + { LLM_KV_FINAL_LOGIT_SOFTCAPPING, "%s.final_logit_softcapping" }, { LLM_KV_SWIN_NORM, "%s.swin_norm" }, { LLM_KV_RESCALE_EVERY_N_LAYERS, "%s.rescale_every_n_layers" }, { LLM_KV_TIME_MIX_EXTRA_DIM, "%s.time_mix_extra_dim" }, From 53446f7e42da6ce56e94a64013b6845a35dd76ce Mon Sep 17 00:00:00 2001 From: ibrahimkhadraoui Date: Mon, 7 Jul 2025 15:29:56 +0400 Subject: [PATCH 36/98] rm unused MAMBA_CHUNK_SIZE --- convert_hf_to_gguf.py | 1 - src/llama-arch.h | 1 - src/llama-model.cpp | 1 - 3 files changed, 3 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 48f52b0e269b2..94527c14fa889 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -6672,7 +6672,6 @@ def set_gguf_parameters(self): # Add Falcon Mamba2 specific configuration - self.gguf_writer.add_uint32("falcon_h1.ssm.mamba_chunk_size", self.hparams["mamba_chunk_size"]) self.gguf_writer.add_uint32("falcon_h1.attention.head_dim", self.hparams["head_dim"]) self.gguf_writer.add_uint32("falcon_h1.ssm.mamba_d_ssm", self.hparams["mamba_d_ssm"]) self.gguf_writer.add_uint32("falcon_h1.num_attention_heads", self.find_hparam(["num_attention_heads"])) diff --git a/src/llama-arch.h b/src/llama-arch.h index 80af4223616cc..1dcd135ff725e 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -165,7 +165,6 @@ enum llm_kv { LLM_KV_FALCON_H1_USE_MLP, LLM_KV_FALCON_H1_MAMBA_NORM_BEFORE_GATE, LLM_KV_FALCON_H1_MAMBA_RMS_NORM, - LLM_KV_FALCON_H1_MAMBA_CHUNK_SIZE, LLM_KV_ROPE_DIMENSION_COUNT, LLM_KV_ROPE_DIMENSION_SECTIONS, diff --git a/src/llama-model.cpp b/src/llama-model.cpp index bf6613a80edf1..d5b080b3c2cac 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -1563,7 +1563,6 @@ void llama_model::load_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank); ml.get_key(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group); ml.get_key(LLM_KV_SSM_HEAD_DIM, hparams.ssm_head_dim); - ml.get_key(LLM_KV_FALCON_H1_MAMBA_CHUNK_SIZE, hparams.chunk_size); // Falcon-H1 parameters ml.get_key(LLM_KV_ATTN_HEAD_DIM, hparams.attn_head_dim); From ae937f442c8ba6d5bafc2c7c336cb70ac5e21cc0 Mon Sep 17 00:00:00 2001 From: ibrahimkhadraoui Date: Mon, 7 Jul 2025 16:57:36 +0400 Subject: [PATCH 37/98] rm unused key --- src/llama-arch.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index b6b9c40a0e3f7..d78d407a03355 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -230,7 +230,6 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_FALCON_H1_USE_MLP, "%s.mamba_use_mlp" }, { LLM_KV_FALCON_H1_MAMBA_NORM_BEFORE_GATE, "%s.mamba_norm_before_gate" }, { LLM_KV_FALCON_H1_MAMBA_RMS_NORM, "%s.mamba_rms_norm" }, - { LLM_KV_FALCON_H1_MAMBA_CHUNK_SIZE, "%s.ssm.mamba_chunk_size" }, { LLM_KV_ADAPTER_TYPE, "adapter.type" }, { LLM_KV_ADAPTER_LORA_ALPHA, "adapter.lora.alpha" }, From b6df0a49d5189bb31cd4716ab731a9ddb7dd57ab Mon Sep 17 00:00:00 2001 From: ibrahimkhadraoui Date: Mon, 7 Jul 2025 16:57:52 +0400 Subject: [PATCH 38/98] add bos False --- convert_hf_to_gguf.py | 1 + 1 file changed, 1 insertion(+) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 
94527c14fa889..f9e279b8f3180 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -6649,6 +6649,7 @@ def set_gguf_parameters(self): self.gguf_writer.add_block_count(self.block_count) self.gguf_writer.add_context_length(self.hparams.get("max_position_embeddings", 0)) self.gguf_writer.add_vocab_size(self.hparams["vocab_size"]) + self.gguf_writer.add_add_bos_token(False) self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) ## Mamba mixer params ## From 935d46fab0232a8eab7bc8c2b4cfb7f81e9671ef Mon Sep 17 00:00:00 2001 From: ibrahimkhadraoui Date: Mon, 7 Jul 2025 17:01:54 +0400 Subject: [PATCH 39/98] changed ROPE_TYPE --- src/llama-model.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/llama-model.cpp b/src/llama-model.cpp index d5b080b3c2cac..ee3b2a2c5f40a 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -14687,18 +14687,18 @@ struct llm_build_falcon_h1 : public llm_graph_context { Qcur = ggml_rope_ext( ctx0, Qcur, inp_pos, nullptr, - n_rot, 0, n_ctx_orig, freq_base, freq_scale, + n_rot, hparams.rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow); Kcur = ggml_rope_ext( ctx0, Kcur, inp_pos, nullptr, - n_rot, 0, n_ctx_orig, freq_base, freq_scale, + n_rot, hparams.rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); - cb(Qcur, "Qcur", il); - cb(Kcur, "Kcur", il); - cb(Vcur, "Vcur", il); + cb(Qcur, "Qcur-post-rope", il); + cb(Kcur, "Kcur-post-rope", il); + cb(Vcur, "Vcur-post-rope", il); ggml_tensor * attn_out = build_attn(inp, gf, model.layers[il].wo, NULL, @@ -15577,11 +15577,11 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { case LLM_ARCH_NEO_BERT: case LLM_ARCH_ARCEE: case LLM_ARCH_ERNIE4_5: - case LLM_ARCH_FALCON_H1: return LLAMA_ROPE_TYPE_NORM; // the pairs of head values are offset by n_rot/2 case LLM_ARCH_FALCON: + case LLM_ARCH_FALCON_H1: case LLM_ARCH_GROK: case LLM_ARCH_DBRX: case LLM_ARCH_BERT: From 624699c53f986a4e5cfeb3dba8f621c6d7a58dfc Mon Sep 17 00:00:00 2001 From: ibrahimkhadraoui Date: Mon, 7 Jul 2025 17:20:24 +0400 Subject: [PATCH 40/98] cleaning debugging stuff --- src/llama-context.cpp | 6 ------ 1 file changed, 6 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index e319beaa98ee2..0eabc76ddf601 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -2220,14 +2220,8 @@ llama_context * llama_init_from_model( return nullptr; } - // try { auto * ctx = new llama_context(*model, params); return ctx; - // } catch (const std::exception & err) { - // LLAMA_LOG_ERROR("%s: failed to initialize the context: %s\n", __func__, err.what()); - // } - - // return nullptr; } // deprecated From 042e5ff90ba7a580715e04d6f32bdf7f061e3361 Mon Sep 17 00:00:00 2001 From: ibrahimkhadraoui Date: Mon, 7 Jul 2025 17:21:54 +0400 Subject: [PATCH 41/98] cleaning debug quant --- src/llama-quant.cpp | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 2039ed5234f5d..f4b5713d7dd9a 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -315,11 +315,6 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) { new_type = GGML_TYPE_IQ2_S; } - } else if (name.find("ssm_in.weight") != std::string::npos) { - // For mamba-based models it's better to not quantize the ssm-proj layers - if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) { - new_type = GGML_TYPE_BF16; - } } else if 
(name.find("attn_q.weight") != std::string::npos) { if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) { new_type = GGML_TYPE_IQ3_XXS; From f74e266f0467054c035e1b9597bdb526f2a9a1d6 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Mon, 7 Jul 2025 17:23:47 +0400 Subject: [PATCH 42/98] fix comment --- src/llama-context.cpp | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 0eabc76ddf601..06e93b19cbf40 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -2220,8 +2220,14 @@ llama_context * llama_init_from_model( return nullptr; } - auto * ctx = new llama_context(*model, params); - return ctx; + try { + auto * ctx = new llama_context(*model, params); + return ctx; + } catch (const std::exception & err) { + LLAMA_LOG_ERROR("%s: failed to initialize the context: %s\n", __func__, err.what()); + } + + return nullptr; } // deprecated From 632861e6c1c21d28b362e94f90d2ab30b920989d Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Mon, 7 Jul 2025 17:27:34 +0400 Subject: [PATCH 43/98] some cleanups --- convert_hf_to_gguf.py | 2 -- src/llama-arch.cpp | 2 -- src/llama-arch.h | 2 -- src/llama-hparams.h | 2 -- src/llama-model.cpp | 2 -- 5 files changed, 10 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index f9e279b8f3180..09f6cd173c657 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -6682,8 +6682,6 @@ def set_gguf_parameters(self): # Add any other Falcon Mamba2 specific configuration - self.gguf_writer.add_bool("falcon_h1.mamba_use_mlp", self.find_hparam(["mamba_use_mlp"], optional=True)) - self.gguf_writer.add_bool("falcon_h1.mamba_norm_before_gate", self.find_hparam(["mamba_norm_before_gate"], optional=True)) self.gguf_writer.add_bool("falcon_h1.mamba_rms_norm", self.find_hparam(["mamba_rms_norm"], optional=True)) self.gguf_writer.add_rope_freq_base(self.find_hparam(["rope_theta"])) diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index d78d407a03355..fce25a5064ff4 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -227,8 +227,6 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_SSM_HEAD_DIM, "%s.ssm.head_dim" }, { LLM_KV_MAMBA_D_SSM, "%s.ssm.mamba_d_ssm" }, - { LLM_KV_FALCON_H1_USE_MLP, "%s.mamba_use_mlp" }, - { LLM_KV_FALCON_H1_MAMBA_NORM_BEFORE_GATE, "%s.mamba_norm_before_gate" }, { LLM_KV_FALCON_H1_MAMBA_RMS_NORM, "%s.mamba_rms_norm" }, { LLM_KV_ADAPTER_TYPE, "adapter.type" }, diff --git a/src/llama-arch.h b/src/llama-arch.h index 1dcd135ff725e..ac7bc39166791 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -162,8 +162,6 @@ enum llm_kv { LLM_KV_SSM_HEAD_DIM, LLM_KV_MAMBA_D_SSM, LLM_KV_N_LAYER, - LLM_KV_FALCON_H1_USE_MLP, - LLM_KV_FALCON_H1_MAMBA_NORM_BEFORE_GATE, LLM_KV_FALCON_H1_MAMBA_RMS_NORM, LLM_KV_ROPE_DIMENSION_COUNT, diff --git a/src/llama-hparams.h b/src/llama-hparams.h index 429eaf0482d0b..556c8a056fe88 100644 --- a/src/llama-hparams.h +++ b/src/llama-hparams.h @@ -119,8 +119,6 @@ struct llama_hparams { uint32_t ssm_mamba_d_ssm = 0; uint32_t attn_head_dim = 0; - bool mamba_use_mlp = false; - bool mamba_norm_before_gate = false; bool mamba_rms_norm = false; double rope_theta = 10000.0; uint32_t vocab_size = 0; diff --git a/src/llama-model.cpp b/src/llama-model.cpp index ee3b2a2c5f40a..0e9da3c410dcf 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -1566,8 +1566,6 @@ void llama_model::load_hparams(llama_model_loader & ml) { // Falcon-H1 parameters ml.get_key(LLM_KV_ATTN_HEAD_DIM, hparams.attn_head_dim); - 
ml.get_key(LLM_KV_FALCON_H1_USE_MLP, hparams.mamba_use_mlp); - ml.get_key(LLM_KV_FALCON_H1_MAMBA_NORM_BEFORE_GATE, hparams.mamba_norm_before_gate); ml.get_key(LLM_KV_FALCON_H1_MAMBA_RMS_NORM, hparams.mamba_rms_norm); std::fill(hparams.recurrent_layer_arr.begin(), hparams.recurrent_layer_arr.end(), true); From 084873c2157af5c46acb593b2ebea62fce6ac947 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Mon, 7 Jul 2025 17:28:08 +0400 Subject: [PATCH 44/98] some cleanups --- src/llama-arch.cpp | 1 - src/llama-arch.h | 1 - 2 files changed, 2 deletions(-) diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index fce25a5064ff4..fd269cc027957 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -1850,7 +1850,6 @@ static const std::map LLM_TENSOR_INFOS = { {LLM_TENSOR_ATTN_KV_A_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_ATTN_SUB_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_FFN_SUB_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, - {LLM_TENSOR_FFN_PRE_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_DEC_ATTN_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_DEC_CROSS_ATTN_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_DEC_FFN_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, diff --git a/src/llama-arch.h b/src/llama-arch.h index ac7bc39166791..a14daf0ede8c3 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -397,7 +397,6 @@ enum llm_tensor { LLM_TENSOR_POS_NET_ATTN_K, LLM_TENSOR_POS_NET_ATTN_V, LLM_TENSOR_POS_NET_ATTN_OUT, - LLM_TENSOR_FFN_PRE_NORM, LLM_TENSOR_FINAL_NORM, }; From fd203302aa79a759774b507313ddf16bdf6cdf89 Mon Sep 17 00:00:00 2001 From: Younes B <49240599+younesbelkada@users.noreply.github.com> Date: Mon, 7 Jul 2025 17:29:50 +0400 Subject: [PATCH 45/98] Update src/llama-model-loader.cpp --- src/llama-model-loader.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp index 43079c32a6388..a2eed670acb63 100644 --- a/src/llama-model-loader.cpp +++ b/src/llama-model-loader.cpp @@ -400,7 +400,7 @@ namespace GGUFMeta { template bool llama_model_loader::get_key (enum llm_kv kid, bool & result, bool required); template bool llama_model_loader::get_key (enum llm_kv kid, float & result, bool required); - template bool llama_model_loader::get_key (enum llm_kv kid, double & result, bool required); + template bool llama_model_loader::get_key (enum llm_kv kid, double & result, bool required); template bool llama_model_loader::get_key (enum llm_kv kid, uint32_t & result, bool required); template bool llama_model_loader::get_key(enum llm_kv kid, std::string & result, bool required); From 68cb7845e9e46a04caae0d03b385494d8d59a202 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Mon, 7 Jul 2025 17:34:20 +0400 Subject: [PATCH 46/98] more cleanups --- src/llama-arch.cpp | 6 ------ 1 file changed, 6 deletions(-) diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index fd269cc027957..ce24a6abfbfbc 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -218,12 +218,6 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_TOKENIZER_FIM_REP_ID, "tokenizer.ggml.fim_rep_token_id" }, { LLM_KV_TOKENIZER_FIM_SEP_ID, "tokenizer.ggml.fim_sep_token_id" }, - { LLM_KV_SSM_CONV_KERNEL, "%s.ssm.conv_kernel" }, - { LLM_KV_SSM_INNER_SIZE, "%s.ssm.inner_size" }, - { LLM_KV_SSM_STATE_SIZE, "%s.ssm.state_size" }, - { LLM_KV_SSM_TIME_STEP_RANK, "%s.ssm.time_step_rank" }, - { LLM_KV_SSM_DT_B_C_RMS, "%s.ssm.dt_b_c_rms" }, - { LLM_KV_SSM_GROUP_COUNT, 
"%s.ssm.group_count" }, { LLM_KV_SSM_HEAD_DIM, "%s.ssm.head_dim" }, { LLM_KV_MAMBA_D_SSM, "%s.ssm.mamba_d_ssm" }, From d2f46f18ac2399787cd98c7b1ddc5164eb94ae54 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Mon, 7 Jul 2025 17:36:22 +0400 Subject: [PATCH 47/98] moe cleanuips --- src/llama-hparams.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/llama-hparams.h b/src/llama-hparams.h index 556c8a056fe88..dd49d9bfc426b 100644 --- a/src/llama-hparams.h +++ b/src/llama-hparams.h @@ -120,14 +120,12 @@ struct llama_hparams { uint32_t attn_head_dim = 0; bool mamba_rms_norm = false; - double rope_theta = 10000.0; uint32_t vocab_size = 0; uint32_t intermediate_size = 0; - float mamba_expand = 0.0f; + float mamba_expand = 0.0f; bool ssm_rms_norm = false; bool ssm_conv_bias = false; bool ssm_proj_bias = false; - uint32_t chunk_size = 0; // for hybrid state space models std::array recurrent_layer_arr; From 7d7da0b37e1a30c85a9c2e9f07dd5ddc0363afd1 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Tue, 8 Jul 2025 10:18:43 +0400 Subject: [PATCH 48/98] d_ssm -> d_inner; --- convert_hf_to_gguf.py | 3 ++- src/llama-arch.cpp | 1 - src/llama-arch.h | 1 - src/llama-hparams.cpp | 9 ++------- src/llama-hparams.h | 1 - src/llama-model.cpp | 31 +++++++++++++++---------------- 6 files changed, 19 insertions(+), 27 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 09f6cd173c657..f601f3277a541 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -6674,7 +6674,8 @@ def set_gguf_parameters(self): # Add Falcon Mamba2 specific configuration self.gguf_writer.add_uint32("falcon_h1.attention.head_dim", self.hparams["head_dim"]) - self.gguf_writer.add_uint32("falcon_h1.ssm.mamba_d_ssm", self.hparams["mamba_d_ssm"]) + self.gguf_writer.add_uint32("falcon_h1.ssm.mamba_d_inner", self.hparams["mamba_d_ssm"]) + self.gguf_writer.add_ssm_inner_size(self.hparams["mamba_d_ssm"]) self.gguf_writer.add_uint32("falcon_h1.num_attention_heads", self.find_hparam(["num_attention_heads"])) self.gguf_writer.add_uint32("falcon_h1.num_key_value_heads", self.find_hparam(["num_key_value_heads"], optional=True) or diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index ce24a6abfbfbc..b339872581c14 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -219,7 +219,6 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_TOKENIZER_FIM_SEP_ID, "tokenizer.ggml.fim_sep_token_id" }, { LLM_KV_SSM_HEAD_DIM, "%s.ssm.head_dim" }, - { LLM_KV_MAMBA_D_SSM, "%s.ssm.mamba_d_ssm" }, { LLM_KV_FALCON_H1_MAMBA_RMS_NORM, "%s.mamba_rms_norm" }, diff --git a/src/llama-arch.h b/src/llama-arch.h index a14daf0ede8c3..3b03308b8f80e 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -160,7 +160,6 @@ enum llm_kv { // Falcon-H1 specific LLM_KV_ATTN_HEAD_DIM, LLM_KV_SSM_HEAD_DIM, - LLM_KV_MAMBA_D_SSM, LLM_KV_N_LAYER, LLM_KV_FALCON_H1_MAMBA_RMS_NORM, diff --git a/src/llama-hparams.cpp b/src/llama-hparams.cpp index 3196138b359af..bf7aece8de4e1 100644 --- a/src/llama-hparams.cpp +++ b/src/llama-hparams.cpp @@ -76,12 +76,7 @@ uint32_t llama_hparams::n_embd_r() const { // Corresponds to Mamba's conv_states size // check if the architecture is using d_ssm - if (ssm_mamba_d_ssm > 0) { - return (ssm_d_conv > 0 ? ssm_d_conv - 1 : 0) * (ssm_mamba_d_ssm + 2*ssm_n_group*ssm_d_state); - } else { - return (ssm_d_conv > 0 ? ssm_d_conv - 1 : 0) * (ssm_d_inner + 2*ssm_n_group*ssm_d_state); - } - + return (ssm_d_conv > 0 ? 
ssm_d_conv - 1 : 0) * (ssm_d_inner + 2*ssm_n_group*ssm_d_state); } uint32_t llama_hparams::n_embd_s() const { @@ -91,7 +86,7 @@ uint32_t llama_hparams::n_embd_s() const { } // corresponds to Mamba's ssm_states size - return (ssm_mamba_d_ssm > 0 ? ssm_d_state * ssm_mamba_d_ssm : ssm_d_state * ssm_d_inner); + return ssm_d_state * ssm_d_inner; } bool llama_hparams::is_recurrent(uint32_t il) const { diff --git a/src/llama-hparams.h b/src/llama-hparams.h index dd49d9bfc426b..763e7f8e1c056 100644 --- a/src/llama-hparams.h +++ b/src/llama-hparams.h @@ -116,7 +116,6 @@ struct llama_hparams { uint32_t ssm_dt_rank = 0; uint32_t ssm_n_group = 0; uint32_t ssm_head_dim = 0; - uint32_t ssm_mamba_d_ssm = 0; uint32_t attn_head_dim = 0; bool mamba_rms_norm = false; diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 0e9da3c410dcf..ce84c7a504e48 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -1556,7 +1556,6 @@ void llama_model::load_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); // SSM parameters - ml.get_key(LLM_KV_MAMBA_D_SSM, hparams.ssm_mamba_d_ssm); ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv); ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner); ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state); @@ -4520,7 +4519,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) { const int64_t ssm_conv_kernel_size = hparams.ssm_d_conv; // ssm_conv_kernel_size const int64_t ssm_n_groups = hparams.ssm_n_group; // ssm_n_groups const int64_t ssm_state_size = hparams.ssm_d_state; // ssm_state_size - const int64_t ssm_intermediate_size = hparams.ssm_mamba_d_ssm > 0 ? hparams.ssm_mamba_d_ssm : int(hparams.mamba_expand * hidden_size); // TODO expand + const int64_t ssm_intermediate_size = hparams.ssm_d_inner; // TODO expand const int64_t ssm_num_heads = hparams.ssm_dt_rank; // ssm_num_heads const int64_t ssm_conv_dim = ssm_intermediate_size + 2 * ssm_n_groups * ssm_state_size; const int64_t ssm_projection_size = ssm_intermediate_size + ssm_conv_dim + ssm_num_heads; @@ -14777,10 +14776,10 @@ struct llm_build_falcon_h1 : public llm_graph_context { const auto kv_head = kv_state->get_head(); const int64_t d_conv = hparams.ssm_d_conv; - const int64_t d_ssm = hparams.ssm_mamba_d_ssm; + const int64_t d_inner = hparams.ssm_d_inner; const int64_t d_state = hparams.ssm_d_state; const int64_t n_head = hparams.ssm_dt_rank; - const int64_t head_dim = hparams.ssm_head_dim == 0 ? d_ssm / n_head : hparams.ssm_head_dim; + const int64_t head_dim = hparams.ssm_head_dim == 0 ? 
d_inner / n_head : hparams.ssm_head_dim; const int64_t n_group = hparams.ssm_n_group; const int64_t n_seqs = ubatch.n_seqs; @@ -14794,7 +14793,7 @@ struct llm_build_falcon_h1 : public llm_graph_context { ggml_tensor * ssm_states_all = kv_state->get_s_l(il); ggml_tensor * conv = build_rs(inp, gf, conv_states_all, hparams.n_embd_r(), n_seqs); - conv = ggml_reshape_3d(ctx0, conv, d_conv - 1, d_ssm + 2*n_group*d_state, n_seqs); + conv = ggml_reshape_3d(ctx0, conv, d_conv - 1, d_inner + 2*n_group*d_state, n_seqs); // {n_embd, n_tokens} => {n_embd, n_seq_tokens, n_seqs} cur = ggml_reshape_3d(ctx0, cur, cur->ne[0], n_seq_tokens, n_seqs); @@ -14807,8 +14806,8 @@ struct llm_build_falcon_h1 : public llm_graph_context { // split the above in three ggml_tensor * z = ggml_view_4d(ctx0, zxBCdt, head_dim, n_head, n_seq_tokens, n_seqs, head_dim*zxBCdt->nb[0], zxBCdt->nb[1], zxBCdt->nb[2], 0); - ggml_tensor * xBC = ggml_view_3d(ctx0, zxBCdt, d_ssm + 2*n_group*d_state, n_seq_tokens, n_seqs, zxBCdt->nb[1], zxBCdt->nb[2], d_ssm*ggml_element_size(zxBCdt)); - ggml_tensor * dt = ggml_view_3d(ctx0, zxBCdt, n_head, n_seq_tokens, n_seqs, zxBCdt->nb[1], zxBCdt->nb[2], (2*d_ssm + 2*n_group*d_state)*ggml_element_size(zxBCdt)); + ggml_tensor * xBC = ggml_view_3d(ctx0, zxBCdt, d_inner + 2*n_group*d_state, n_seq_tokens, n_seqs, zxBCdt->nb[1], zxBCdt->nb[2], d_inner*ggml_element_size(zxBCdt)); + ggml_tensor * dt = ggml_view_3d(ctx0, zxBCdt, n_head, n_seq_tokens, n_seqs, zxBCdt->nb[1], zxBCdt->nb[2], (2*d_inner + 2*n_group*d_state)*ggml_element_size(zxBCdt)); // conv { @@ -14816,13 +14815,13 @@ struct llm_build_falcon_h1 : public llm_graph_context { ggml_tensor * conv_x = ggml_concat(ctx0, conv, ggml_transpose(ctx0, xBC), 0); // copy last (d_conv - 1) columns back into the state cache - ggml_tensor * last_conv = ggml_view_3d(ctx0, conv_x, d_conv - 1, d_ssm + 2*n_group*d_state, n_seqs, conv_x->nb[1], conv_x->nb[2], n_seq_tokens*(conv_x->nb[0])); + ggml_tensor * last_conv = ggml_view_3d(ctx0, conv_x, d_conv - 1, d_inner + 2*n_group*d_state, n_seqs, conv_x->nb[1], conv_x->nb[2], n_seq_tokens*(conv_x->nb[0])); ggml_build_forward_expand(gf, ggml_cpy(ctx0, last_conv, ggml_view_1d(ctx0, conv_states_all, - (d_conv - 1)*(d_ssm + 2*n_group*d_state)*(n_seqs), - kv_head*(d_conv - 1)*(d_ssm + 2*n_group*d_state)*ggml_element_size(conv_states_all)))); + (d_conv - 1)*(d_inner + 2*n_group*d_state)*(n_seqs), + kv_head*(d_conv - 1)*(d_inner + 2*n_group*d_state)*ggml_element_size(conv_states_all)))); // 1D convolution // The equivalent is to make a self-overlapping view of conv_x @@ -14846,9 +14845,9 @@ struct llm_build_falcon_h1 : public llm_graph_context { // These correspond to V K Q in SSM/attention duality ggml_tensor * x = ggml_view_4d(ctx0, xBC, head_dim, n_head, n_seq_tokens, n_seqs, head_dim*xBC->nb[0], xBC->nb[1], xBC->nb[2], 0); - ggml_tensor * B = ggml_view_4d(ctx0, xBC, d_state, n_group, n_seq_tokens, n_seqs, d_state*xBC->nb[0], xBC->nb[1], xBC->nb[2], d_ssm*ggml_element_size(xBC)); + ggml_tensor * B = ggml_view_4d(ctx0, xBC, d_state, n_group, n_seq_tokens, n_seqs, d_state*xBC->nb[0], xBC->nb[1], xBC->nb[2], d_inner*ggml_element_size(xBC)); - ggml_tensor * C = ggml_view_4d(ctx0, xBC, d_state, n_group, n_seq_tokens, n_seqs, d_state*xBC->nb[0], xBC->nb[1], xBC->nb[2], (d_ssm + n_group*d_state)*ggml_element_size(xBC)); + ggml_tensor * C = ggml_view_4d(ctx0, xBC, d_state, n_group, n_seq_tokens, n_seqs, d_state*xBC->nb[0], xBC->nb[1], xBC->nb[2], (d_inner + n_group*d_state)*ggml_element_size(xBC)); // {n_head, n_seq_tokens, n_seqs} dt 
= ggml_add(ctx0, ggml_cont(ctx0, dt), model.layers[il].ssm_dt_b); @@ -14871,8 +14870,8 @@ struct llm_build_falcon_h1 : public llm_graph_context { // store last states ggml_build_forward_expand(gf, ggml_cpy(ctx0, - ggml_view_1d(ctx0, y_ssm, d_state*d_ssm*n_seqs, ggml_nelements(x)*x->nb[0]), - ggml_view_1d(ctx0, ssm_states_all, d_state*d_ssm*n_seqs, kv_head*d_state*d_ssm*ggml_element_size(ssm_states_all)))); + ggml_view_1d(ctx0, y_ssm, d_state*d_inner*n_seqs, ggml_nelements(x)*x->nb[0]), + ggml_view_1d(ctx0, ssm_states_all, d_state*d_inner*n_seqs, kv_head*d_state*d_inner*ggml_element_size(ssm_states_all)))); ggml_tensor * y = ggml_view_4d(ctx0, y_ssm, head_dim, n_head, n_seq_tokens, n_seqs, x->nb[1], n_head*x->nb[1], n_seq_tokens*n_head*x->nb[1], 0); @@ -14883,11 +14882,11 @@ struct llm_build_falcon_h1 : public llm_graph_context { // grouped RMS norm if (hparams.mamba_rms_norm){ - y = ggml_reshape_4d(ctx0, y, d_ssm / n_group, n_group, n_seq_tokens, n_seqs); + y = ggml_reshape_4d(ctx0, y, d_inner / n_group, n_group, n_seq_tokens, n_seqs); y = build_norm(y, model.layers[il].ssm_norm, NULL, LLM_NORM_RMS, il); } - y = ggml_reshape_3d(ctx0, y, d_ssm, n_seq_tokens, n_seqs); + y = ggml_reshape_3d(ctx0, y, d_inner, n_seq_tokens, n_seqs); // {d_inner, n_embd} @ {d_inner, n_seq_tokens, n_seqs} => {n_embd, n_seq_tokens, n_seqs} cur = build_lora_mm(model.layers[il].ssm_out, y); From 67b26642906db886ef91a045833832ae32899b7a Mon Sep 17 00:00:00 2001 From: ibrahimkhadraoui Date: Tue, 8 Jul 2025 10:20:17 +0400 Subject: [PATCH 49/98] cleaning unused hparams --- src/llama-hparams.h | 9 --------- src/llama-model.cpp | 39 +++++++++++++++------------------------ 2 files changed, 15 insertions(+), 33 deletions(-) diff --git a/src/llama-hparams.h b/src/llama-hparams.h index dd49d9bfc426b..d508817db41bb 100644 --- a/src/llama-hparams.h +++ b/src/llama-hparams.h @@ -118,15 +118,6 @@ struct llama_hparams { uint32_t ssm_head_dim = 0; uint32_t ssm_mamba_d_ssm = 0; - uint32_t attn_head_dim = 0; - bool mamba_rms_norm = false; - uint32_t vocab_size = 0; - uint32_t intermediate_size = 0; - float mamba_expand = 0.0f; - bool ssm_rms_norm = false; - bool ssm_conv_bias = false; - bool ssm_proj_bias = false; - // for hybrid state space models std::array recurrent_layer_arr; diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 0e9da3c410dcf..3ac307dab38a5 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -1552,7 +1552,6 @@ void llama_model::load_hparams(llama_model_loader & ml) { case LLM_ARCH_FALCON_H1: { // Common parameters - ml.get_key(LLM_KV_VOCAB_SIZE, hparams.vocab_size); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); // SSM parameters @@ -1564,10 +1563,6 @@ void llama_model::load_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group); ml.get_key(LLM_KV_SSM_HEAD_DIM, hparams.ssm_head_dim); - // Falcon-H1 parameters - ml.get_key(LLM_KV_ATTN_HEAD_DIM, hparams.attn_head_dim); - ml.get_key(LLM_KV_FALCON_H1_MAMBA_RMS_NORM, hparams.mamba_rms_norm); - std::fill(hparams.recurrent_layer_arr.begin(), hparams.recurrent_layer_arr.end(), true); switch (hparams.n_layer) { @@ -4514,31 +4509,29 @@ bool llama_model::load_tensors(llama_model_loader & ml) { { // Common const int64_t hidden_size = hparams.n_embd; // hidden_size - const int64_t vocab_size = hparams.vocab_size; // vocab_size // mamba2 Mixer SSM params const int64_t ssm_conv_kernel_size = hparams.ssm_d_conv; // ssm_conv_kernel_size const int64_t ssm_n_groups = hparams.ssm_n_group; // ssm_n_groups 
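// The derived sizes a few lines below follow the Mamba-2 in_proj layout consumed by build_mamba2_layer:
//   ssm_conv_dim        = ssm_mamba_d_ssm + 2*ssm_n_groups*ssm_state_size   (x, B and C all pass through the conv1d)
//   ssm_projection_size = ssm_mamba_d_ssm + ssm_conv_dim + ssm_num_heads    (gate z, then x/B/C, then dt)
// For example, with assumed illustrative values ssm_mamba_d_ssm = 1536, ssm_n_groups = 1, ssm_state_size = 256, ssm_num_heads = 24,
// this gives ssm_conv_dim = 1536 + 2*1*256 = 2048 and ssm_projection_size = 1536 + 2048 + 24 = 3608.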
const int64_t ssm_state_size = hparams.ssm_d_state; // ssm_state_size - const int64_t ssm_intermediate_size = hparams.ssm_mamba_d_ssm > 0 ? hparams.ssm_mamba_d_ssm : int(hparams.mamba_expand * hidden_size); // TODO expand + const int64_t ssm_mamba_d_ssm = hparams.ssm_mamba_d_ssm; const int64_t ssm_num_heads = hparams.ssm_dt_rank; // ssm_num_heads - const int64_t ssm_conv_dim = ssm_intermediate_size + 2 * ssm_n_groups * ssm_state_size; - const int64_t ssm_projection_size = ssm_intermediate_size + ssm_conv_dim + ssm_num_heads; + const int64_t ssm_conv_dim = ssm_mamba_d_ssm + 2 * ssm_n_groups * ssm_state_size; + const int64_t ssm_projection_size = ssm_mamba_d_ssm + ssm_conv_dim + ssm_num_heads; // attn params const int64_t attn_num_attention_head = hparams.n_head(0); // rename to: attn_num_attention_head const int64_t attn_num_key_value_head = hparams.n_head_kv(0); - const int64_t attn_head_dim = hparams.attn_head_dim > 0 ? hparams.attn_head_dim : hidden_size / attn_num_attention_head; // ffn params const int64_t ffn_intermediate_size = hparams.n_ff(0); // embeddings - tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {hidden_size, vocab_size}, 0); + tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {hidden_size, n_vocab}, 0); // output { - output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {hidden_size, vocab_size}, TENSOR_NOT_REQUIRED); + output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {hidden_size, n_vocab}, TENSOR_NOT_REQUIRED); final_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {hidden_size}, 0); } @@ -4558,21 +4551,19 @@ bool llama_model::load_tensors(llama_model_loader & ml) { layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {1, ssm_num_heads}, 0); layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {1, ssm_num_heads}, 0); // ssm_norm - if (hparams.mamba_rms_norm == true) { - layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), {ssm_intermediate_size / ssm_n_groups, ssm_n_groups}, 0); - } + layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), {ssm_mamba_d_ssm / ssm_n_groups, ssm_n_groups}, 0); // out_proj - layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {ssm_intermediate_size, hidden_size}, 0); + layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {ssm_mamba_d_ssm, hidden_size}, 0); /*ATTENTION LAYERS*/ // attention layers (with optional bias) - layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {hidden_size, attn_head_dim * attn_num_attention_head}, 0); - layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {hidden_size, attn_num_key_value_head * attn_head_dim}, 0); - layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {hidden_size, attn_num_key_value_head * attn_head_dim}, 0); - layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {attn_head_dim * attn_num_attention_head, hidden_size}, 0); + layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {hidden_size, n_embd_head_k * attn_num_attention_head}, 0); + layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {hidden_size, attn_num_key_value_head * n_embd_head_k}, 0); + layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {hidden_size, attn_num_key_value_head * n_embd_head_v}, 0); + layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * attn_num_attention_head, hidden_size}, 0); layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {hidden_size}, llama_model_loader::TENSOR_NOT_REQUIRED); - layer.bk = 
create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {attn_num_key_value_head * attn_head_dim}, llama_model_loader::TENSOR_NOT_REQUIRED); - layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {attn_num_key_value_head * attn_head_dim}, llama_model_loader::TENSOR_NOT_REQUIRED); + layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {attn_num_key_value_head * n_embd_head_k}, llama_model_loader::TENSOR_NOT_REQUIRED); + layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {attn_num_key_value_head * n_embd_head_v}, llama_model_loader::TENSOR_NOT_REQUIRED); layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {hidden_size}, llama_model_loader::TENSOR_NOT_REQUIRED); layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {hidden_size}, 0); @@ -14717,7 +14708,7 @@ struct llm_build_falcon_h1 : public llm_graph_context { inpSA = ggml_add(ctx0, cur, inpSA); cb(cur, "layer_out", il); - if (il == n_layer - 1) { + if (il == n_layer - 1 && inp_out_ids) { cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } @@ -14882,7 +14873,7 @@ struct llm_build_falcon_h1 : public llm_graph_context { y = ggml_mul(ctx0, y, ggml_silu(ctx0, ggml_cont(ctx0, z))); // grouped RMS norm - if (hparams.mamba_rms_norm){ + if (model.layers[il].ssm_norm) { y = ggml_reshape_4d(ctx0, y, d_ssm / n_group, n_group, n_seq_tokens, n_seqs); y = build_norm(y, model.layers[il].ssm_norm, NULL, LLM_NORM_RMS, il); } From e63ee4649e148f4ed329c26cb993af83a42e0a71 Mon Sep 17 00:00:00 2001 From: ibrahimkhadraoui Date: Tue, 8 Jul 2025 10:31:12 +0400 Subject: [PATCH 50/98] cleanup --- src/llama-model.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 96b377d64e19e..140ec5b8e790c 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -4515,8 +4515,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) { const int64_t ssm_state_size = hparams.ssm_d_state; // ssm_state_size const int64_t ssm_intermediate_size = hparams.ssm_d_inner; // TODO expand const int64_t ssm_num_heads = hparams.ssm_dt_rank; // ssm_num_heads - const int64_t ssm_conv_dim = ssm_mamba_d_ssm + 2 * ssm_n_groups * ssm_state_size; - const int64_t ssm_projection_size = ssm_mamba_d_ssm + ssm_conv_dim + ssm_num_heads; + const int64_t ssm_conv_dim = ssm_intermediate_size + 2 * ssm_n_groups * ssm_state_size; + const int64_t ssm_projection_size = ssm_intermediate_size + ssm_conv_dim + ssm_num_heads; // attn params const int64_t attn_num_attention_head = hparams.n_head(0); // rename to: attn_num_attention_head @@ -4550,9 +4550,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) { layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {1, ssm_num_heads}, 0); layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {1, ssm_num_heads}, 0); // ssm_norm - layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), {ssm_mamba_d_ssm / ssm_n_groups, ssm_n_groups}, 0); + layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), {ssm_intermediate_size / ssm_n_groups, ssm_n_groups}, 0); // out_proj - layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {ssm_mamba_d_ssm, hidden_size}, 0); + layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {ssm_intermediate_size, hidden_size}, 0); /*ATTENTION LAYERS*/ // attention layers (with optional bias) @@ -14873,7 +14873,7 @@ struct llm_build_falcon_h1 : public llm_graph_context { // grouped RMS norm if (model.layers[il].ssm_norm) { - y = 
ggml_reshape_4d(ctx0, y, d_ssm / n_group, n_group, n_seq_tokens, n_seqs); + y = ggml_reshape_4d(ctx0, y, d_inner / n_group, n_group, n_seq_tokens, n_seqs); y = build_norm(y, model.layers[il].ssm_norm, NULL, LLM_NORM_RMS, il); } From d473d42832d9cd6913b2ba7f329587c4a1d3869f Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Tue, 8 Jul 2025 10:39:12 +0400 Subject: [PATCH 51/98] more cleanups --- convert_hf_to_gguf.py | 13 ++++++------- src/llama-arch.cpp | 3 +-- src/llama-arch.h | 1 - src/llama-model.cpp | 2 +- 4 files changed, 8 insertions(+), 11 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index f601f3277a541..db837a2cbb675 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -6673,17 +6673,16 @@ def set_gguf_parameters(self): # Add Falcon Mamba2 specific configuration - self.gguf_writer.add_uint32("falcon_h1.attention.head_dim", self.hparams["head_dim"]) - self.gguf_writer.add_uint32("falcon_h1.ssm.mamba_d_inner", self.hparams["mamba_d_ssm"]) + self.gguf_writer.add_ssm_head_dim(self.hparams["mamba_d_head"]) self.gguf_writer.add_ssm_inner_size(self.hparams["mamba_d_ssm"]) - self.gguf_writer.add_uint32("falcon_h1.num_attention_heads", self.find_hparam(["num_attention_heads"])) - self.gguf_writer.add_uint32("falcon_h1.num_key_value_heads", - self.find_hparam(["num_key_value_heads"], optional=True) or - self.find_hparam(["num_attention_heads"])) + self.gguf_writer.add_head_count(self.find_hparam(["num_attention_heads"])) + self.gguf_writer.add_key_length(self.hparams["head_dim"]) + self.gguf_writer.add_value_length(self.hparams["head_dim"]) + self.gguf_writer.add_head_count_kv(self.find_hparam(["num_key_value_heads"], optional=True) or + self.find_hparam(["num_attention_heads"])) # Add any other Falcon Mamba2 specific configuration - self.gguf_writer.add_bool("falcon_h1.mamba_rms_norm", self.find_hparam(["mamba_rms_norm"], optional=True)) self.gguf_writer.add_rope_freq_base(self.find_hparam(["rope_theta"])) ###### CONVERSION LOGIC ###### diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index b339872581c14..b6bd815090921 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -128,8 +128,7 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_EMBEDDING_SCALE, "%s.embedding_scale" }, { LLM_KV_TOKEN_SHIFT_COUNT, "%s.token_shift_count" }, { LLM_KV_INTERLEAVE_MOE_LAYER_STEP, "%s.interleave_moe_layer_step" }, - { LLM_KV_ATTN_HEAD_DIM, "%s.attention.head_dim" }, - + { LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" }, { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" }, { LLM_KV_ATTENTION_MAX_ALIBI_BIAS, "%s.attention.max_alibi_bias" }, diff --git a/src/llama-arch.h b/src/llama-arch.h index 3b03308b8f80e..f6ad0fb00a233 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -158,7 +158,6 @@ enum llm_kv { LLM_KV_ATTENTION_LAYER_INDICES, // Falcon-H1 specific - LLM_KV_ATTN_HEAD_DIM, LLM_KV_SSM_HEAD_DIM, LLM_KV_N_LAYER, LLM_KV_FALCON_H1_MAMBA_RMS_NORM, diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 140ec5b8e790c..18c96a49c0e76 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -4550,7 +4550,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) { layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {1, ssm_num_heads}, 0); layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {1, ssm_num_heads}, 0); // ssm_norm - layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), {ssm_intermediate_size / ssm_n_groups, ssm_n_groups}, 0); + layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), 
{ssm_intermediate_size / ssm_n_groups, ssm_n_groups}, 1); // out_proj layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {ssm_intermediate_size, hidden_size}, 0); From 8555ee8b2c0675170de30e218051f9bf5a5eeefe Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Tue, 8 Jul 2025 10:41:33 +0400 Subject: [PATCH 52/98] more cleanups on python conversion; --- convert_hf_to_gguf.py | 1 - 1 file changed, 1 deletion(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index db837a2cbb675..67ce07e17ea32 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -6665,7 +6665,6 @@ def set_gguf_parameters(self): self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) self.gguf_writer.add_key_length(self.hparams["head_dim"]) self.gguf_writer.add_value_length(self.hparams["head_dim"]) - self.gguf_writer.add_float64("falcon_h1.key_multiplier", self.hparams["key_multiplier"]) ## Validation ## assert self.hparams.get("hidden_act") in [None, "silu"], "Only SILU activation supported" From 7846c67e5c50332195769be5a3206e7e89155939 Mon Sep 17 00:00:00 2001 From: ibrahimkhadraoui Date: Tue, 8 Jul 2025 10:42:15 +0400 Subject: [PATCH 53/98] minor cleanups --- src/llama-arch.cpp | 4 ---- src/llama-arch.h | 2 -- 2 files changed, 6 deletions(-) diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index b339872581c14..46eeea814fe3c 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -218,10 +218,6 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_TOKENIZER_FIM_REP_ID, "tokenizer.ggml.fim_rep_token_id" }, { LLM_KV_TOKENIZER_FIM_SEP_ID, "tokenizer.ggml.fim_sep_token_id" }, - { LLM_KV_SSM_HEAD_DIM, "%s.ssm.head_dim" }, - - { LLM_KV_FALCON_H1_MAMBA_RMS_NORM, "%s.mamba_rms_norm" }, - { LLM_KV_ADAPTER_TYPE, "adapter.type" }, { LLM_KV_ADAPTER_LORA_ALPHA, "adapter.lora.alpha" }, diff --git a/src/llama-arch.h b/src/llama-arch.h index 3b03308b8f80e..0bd515a28e73c 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -159,9 +159,7 @@ enum llm_kv { // Falcon-H1 specific LLM_KV_ATTN_HEAD_DIM, - LLM_KV_SSM_HEAD_DIM, LLM_KV_N_LAYER, - LLM_KV_FALCON_H1_MAMBA_RMS_NORM, LLM_KV_ROPE_DIMENSION_COUNT, LLM_KV_ROPE_DIMENSION_SECTIONS, From 2dee7cf96479ea7cfc50dc5bb0520cba6a51aaca Mon Sep 17 00:00:00 2001 From: Younes B <49240599+younesbelkada@users.noreply.github.com> Date: Tue, 8 Jul 2025 10:43:50 +0400 Subject: [PATCH 54/98] Apply suggestions from code review Co-authored-by: Georgi Gerganov --- gguf-py/gguf/constants.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 84a91e82ca154..5ae4eee802fc5 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -289,7 +289,7 @@ class MODEL_ARCH(IntEnum): LLAMA4 = auto() DECI = auto() FALCON = auto() - FALCON_H1 = auto() + FALCON_H1 = auto() BAICHUAN = auto() GROK = auto() GPT2 = auto() @@ -662,7 +662,7 @@ class MODEL_TENSOR(IntEnum): MODEL_ARCH.DOTS1: "dots1", MODEL_ARCH.ARCEE: "arcee", MODEL_ARCH.ERNIE4_5: "ernie4_5", - MODEL_ARCH.FALCON_H1: "falcon_h1", + MODEL_ARCH.FALCON_H1: "falcon_h1", } VISION_PROJECTOR_TYPE_NAMES: dict[VISION_PROJECTOR_TYPE, str] = { From a846d02327b08e58528d5967ad5cfe3f30f2087a Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Tue, 8 Jul 2025 10:44:59 +0400 Subject: [PATCH 55/98] remove todo --- gguf-py/gguf/tensor_mapping.py | 5 +---- src/llama-arch.cpp | 3 +-- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index ff3f273bd5562..b8b081a90b37d 
100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -583,6 +583,7 @@ class TensorNameMap: ), MODEL_TENSOR.SSM_NORM: ( + "model.layers.{bid}.mamba.norm", # falcon-h1 "backbone.layers.{bid}.mixer.norm", # mamba2 ), @@ -1177,10 +1178,6 @@ class TensorNameMap: "resampler.attn.out_proj", ), - MODEL_TENSOR.SSM_NORM: ( - "model.layers.{bid}.mamba.norm", - ), - MODEL_TENSOR.V_RESMPL_KV: ( "resampler.kv_proj", ), diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index b6bd815090921..f6c092cf20a15 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -1955,8 +1955,7 @@ bool llm_arch_is_recurrent(const llm_arch & arch) { } bool llm_arch_is_hybrid(const llm_arch & arch) { - // TODO: There are currently no hybrid models! Once there are, this will be - // the place to identify them + // List all mamba-attention hybrid models here switch (arch) { case LLM_ARCH_FALCON_H1: return true; From f266d145fc96c8a9ba24ce0b10b1cd23c590a9f9 Mon Sep 17 00:00:00 2001 From: ibrahimkhadraoui Date: Tue, 8 Jul 2025 10:53:48 +0400 Subject: [PATCH 56/98] added falcon-h1 --- convert_hf_to_gguf_update.py | 1 + 1 file changed, 1 insertion(+) diff --git a/convert_hf_to_gguf_update.py b/convert_hf_to_gguf_update.py index 2f733f0973686..482abe66d47bd 100755 --- a/convert_hf_to_gguf_update.py +++ b/convert_hf_to_gguf_update.py @@ -86,6 +86,7 @@ class TOKENIZER_TYPE(IntEnum): {"name": "falcon", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/falcon-7b", }, {"name": "bert-bge", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/BAAI/bge-small-en-v1.5", }, {"name": "falcon3", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon3-7B-Base", }, + {"name": "falcon-h1", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon-H1-7B-Base", }, {"name": "bert-bge-large", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/BAAI/bge-large-zh-v1.5", }, {"name": "mpt", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mosaicml/mpt-7b", }, {"name": "starcoder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigcode/starcoder2-3b", }, From 4bc9e0ca897cda2c452d4624159427f8fe18b8d9 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Tue, 8 Jul 2025 10:56:34 +0400 Subject: [PATCH 57/98] tensor not required --- src/llama-model.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 08fb392603616..04b8f6d118af3 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -4549,7 +4549,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) { layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {1, ssm_num_heads}, 0); layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {1, ssm_num_heads}, 0); // ssm_norm - layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), {ssm_intermediate_size / ssm_n_groups, ssm_n_groups}, 0); + layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), {ssm_intermediate_size / ssm_n_groups, ssm_n_groups}, llama_model_loader::TENSOR_NOT_REQUIRED); // out_proj layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {ssm_intermediate_size, hidden_size}, 0); From 2834a4ac1083fe3eb3edb9b8828d20e07385583b Mon Sep 17 00:00:00 2001 From: ibrahimkhadraoui Date: Tue, 8 Jul 2025 11:00:30 +0400 Subject: [PATCH 58/98] clean --- convert_hf_to_gguf.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 67ce07e17ea32..d6cd5754832b9 100755 --- 
a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -6669,14 +6669,6 @@ def set_gguf_parameters(self): ## Validation ## assert self.hparams.get("hidden_act") in [None, "silu"], "Only SILU activation supported" assert self.d_inner % d_head == 0, f"SSM inner size {self.d_inner} not a multiple of head dim {d_head}" - - - # Add Falcon Mamba2 specific configuration - self.gguf_writer.add_ssm_head_dim(self.hparams["mamba_d_head"]) - self.gguf_writer.add_ssm_inner_size(self.hparams["mamba_d_ssm"]) - self.gguf_writer.add_head_count(self.find_hparam(["num_attention_heads"])) - self.gguf_writer.add_key_length(self.hparams["head_dim"]) - self.gguf_writer.add_value_length(self.hparams["head_dim"]) self.gguf_writer.add_head_count_kv(self.find_hparam(["num_key_value_heads"], optional=True) or self.find_hparam(["num_attention_heads"])) From 823696bab18d0355eb68c01a96c77c6392ac9428 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Tue, 8 Jul 2025 11:15:21 +0400 Subject: [PATCH 59/98] remove unneeded attributes --- src/llama-arch.h | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/llama-arch.h b/src/llama-arch.h index 0bd515a28e73c..f84cd217a091f 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -157,10 +157,6 @@ enum llm_kv { LLM_KV_ATTENTION_VALUE_LENGTH_MLA, LLM_KV_ATTENTION_LAYER_INDICES, - // Falcon-H1 specific - LLM_KV_ATTN_HEAD_DIM, - LLM_KV_N_LAYER, - LLM_KV_ROPE_DIMENSION_COUNT, LLM_KV_ROPE_DIMENSION_SECTIONS, LLM_KV_ROPE_FREQ_BASE, From adff470c8a2258f6a53b8b684645366d96a3d059 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Tue, 8 Jul 2025 11:19:38 +0400 Subject: [PATCH 60/98] more cleanups and fixed conversion --- convert_hf_to_gguf.py | 2 +- src/llama-arch.cpp | 1 - src/llama-arch.h | 1 - 3 files changed, 1 insertion(+), 3 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index d6cd5754832b9..27bdf1fd993c1 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -6655,7 +6655,7 @@ def set_gguf_parameters(self): ## Mamba mixer params ## self.gguf_writer.add_ssm_conv_kernel(self.find_hparam(["conv_kernel", "d_conv"])) self.gguf_writer.add_ssm_group_count(self.n_group) - self.gguf_writer.add_ssm_inner_size(self.d_inner) + self.gguf_writer.add_ssm_inner_size(self.find_hparam(["mamba_d_ssm"])) self.gguf_writer.add_ssm_head_dim(d_head := self.find_hparam(["d_head"])) self.gguf_writer.add_ssm_time_step_rank(self.find_hparam(["n_heads"])) diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index f5cc8edb30580..2db2abc607523 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -1736,7 +1736,6 @@ static const std::map LLM_TENSOR_INFOS = { {LLM_TENSOR_CLS, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}}, {LLM_TENSOR_CLS_OUT, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}}, {LLM_TENSOR_OUTPUT_NORM, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}}, - {LLM_TENSOR_FINAL_NORM, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}}, {LLM_TENSOR_DEC_OUTPUT_NORM, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}}, {LLM_TENSOR_ENC_OUTPUT_NORM, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}}, {LLM_TENSOR_ROPE_FREQS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ROPE}}, diff --git a/src/llama-arch.h b/src/llama-arch.h index f84cd217a091f..2bcd2c3cde3f9 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -390,7 +390,6 @@ enum llm_tensor { LLM_TENSOR_POS_NET_ATTN_K, LLM_TENSOR_POS_NET_ATTN_V, LLM_TENSOR_POS_NET_ATTN_OUT, - LLM_TENSOR_FINAL_NORM, }; enum llm_tensor_layer { From 097df0ed855bba81a2fe1e8878119aadd9f2810f Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Tue, 8 Jul 2025 11:26:04 +0400 Subject: [PATCH 
61/98] remove final_norm --- src/llama-model.cpp | 4 ++-- src/llama-model.h | 2 -- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 04b8f6d118af3..421ebe9628b9b 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -4530,7 +4530,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) { // output { output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {hidden_size, n_vocab}, TENSOR_NOT_REQUIRED); - final_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {hidden_size}, 0); + output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {hidden_size}, 0); } for (int i = 0; i < n_layer; ++i) { @@ -14740,7 +14740,7 @@ struct llm_build_falcon_h1 : public llm_graph_context { cur = inpL; cur = build_norm(cur, - model.final_norm, NULL, + model.output_norm, NULL, LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); diff --git a/src/llama-model.h b/src/llama-model.h index 1f089ebd2e128..099685b474e41 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -173,7 +173,6 @@ struct llama_layer { struct ggml_tensor * attn_norm_cross = nullptr; struct ggml_tensor * attn_norm_enc = nullptr; struct ggml_tensor * ssm_norm = nullptr; - struct ggml_tensor * final_norm = nullptr; // attention struct ggml_tensor * wq = nullptr; @@ -365,7 +364,6 @@ struct llama_model { struct ggml_tensor * output = nullptr; struct ggml_tensor * output_b = nullptr; struct ggml_tensor * output_norm_enc = nullptr; - struct ggml_tensor * final_norm = nullptr; // classifier struct ggml_tensor * cls = nullptr; From 9a048d8de97c5091bd7649c2bf215629864546db Mon Sep 17 00:00:00 2001 From: ibrahimkhadraoui Date: Tue, 8 Jul 2025 11:45:58 +0400 Subject: [PATCH 62/98] flake8 fixes --- convert_hf_to_gguf.py | 14 ++------------ gguf-py/gguf/tensor_mapping.py | 14 +++++++------- 2 files changed, 9 insertions(+), 19 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 27bdf1fd993c1..4388f2462b54c 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -686,14 +686,6 @@ def get_vocab_base_pre(self, tokenizer) -> str: if chkhsh == "9d032fcbd5501f4a38150912590928bfb36091efb5df11b8e2124b0390e3fb1e": # ref: https://huggingface.co/tiiuae/Falcon3-7B-Base res = "falcon3" - if ( - chkhsh == "60476e1243776c4fb1b993dbd7a5f15ac22f83c80afdf425fa5ae01c8d44ef86" or - chkhsh == "3eda48b4c4dc7de733d1a8b3e3b4a85243dbbf704da2ee9d42c6beced8897896" or - chkhsh == "48f8e02c0359c0bbdd82f26909171fac1c18a457bb47573ed1fe3bbb2c1cfd4b" or - chkhsh == "a6b57017d60e6edb4d88ecc2845188e0eb333a70357e45dcc9b53964a73bbae6" - ): - # ref: https://huggingface.co/collections/tiiuae/falcon-h1-6819f2795bc406da60fab8df - res = "falcon_h1" if chkhsh == "8e62295832751ca1e8f92f2226f403dea30dc5165e448b5bfa05af5340c64ec7": # ref: https://huggingface.co/BAAI/bge-large-zh-v1.5 res = "bert-bge-large" @@ -6608,7 +6600,7 @@ def _generate_mup_vector(self, block_id: int) -> torch.Tensor: def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: tensors = list(super().modify_tensors(data_torch, name, bid)) - tensor = tensors[0][1] + tensor = tensors[0][1] if "down_proj" in name: tensor = tensor * self.mlp_multipliers[1] @@ -6669,9 +6661,7 @@ def set_gguf_parameters(self): ## Validation ## assert self.hparams.get("hidden_act") in [None, "silu"], "Only SILU activation supported" assert self.d_inner % d_head == 0, f"SSM inner size {self.d_inner} not a multiple of head dim {d_head}" - 
self.gguf_writer.add_head_count_kv(self.find_hparam(["num_key_value_heads"], optional=True) or - self.find_hparam(["num_attention_heads"])) - + self.gguf_writer.add_head_count_kv(self.find_hparam(["num_key_value_heads"], optional=True) or self.find_hparam(["num_attention_heads"])) # Add any other Falcon Mamba2 specific configuration self.gguf_writer.add_rope_freq_base(self.find_hparam(["rope_theta"])) diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index b8b081a90b37d..f275e9d481b24 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -293,7 +293,7 @@ class TensorNameMap: MODEL_TENSOR.FFN_POST_NORM: ( "model.layers.{bid}.post_feedforward_layernorm", # gemma2 olmo2 "model.layers.{bid}.post_mlp_layernorm", # glm-4-0414 - "model.layers.{bid}.feed_forward.up_proj", + "model.layers.{bid}.feed_forward.up_proj", ), MODEL_TENSOR.FFN_GATE_INP: ( @@ -364,7 +364,7 @@ class TensorNameMap: "model.layers.{bid}.mlp.shared_expert.up_proj", # qwen2moe "model.layers.{bid}.mlp.shared_experts.up_proj", # deepseek deepseek2 "model.layers.{bid}.feed_forward.shared_expert.up_proj", # llama4 - "model.layers.{bid}.feed_forward.down_proj", + "model.layers.{bid}.feed_forward.down_proj", ), # AWQ-activation gate @@ -550,13 +550,13 @@ class TensorNameMap: MODEL_TENSOR.SSM_IN: ( "model.layers.{bid}.in_proj", "backbone.layers.{bid}.mixer.in_proj", - "model.layers.{bid}.mamba.in_proj", + "model.layers.{bid}.mamba.in_proj", ), MODEL_TENSOR.SSM_CONV1D: ( "model.layers.{bid}.conv1d", "backbone.layers.{bid}.mixer.conv1d", - "model.layers.{bid}.mamba.conv1d", + "model.layers.{bid}.mamba.conv1d", ), MODEL_TENSOR.SSM_X: ( @@ -567,13 +567,13 @@ class TensorNameMap: MODEL_TENSOR.SSM_DT: ( "model.layers.{bid}.dt_proj", "backbone.layers.{bid}.mixer.dt_proj", - "model.layers.{bid}.mamba.dt_proj", + "model.layers.{bid}.mamba.dt_proj", ), MODEL_TENSOR.SSM_A: ( "model.layers.{bid}.A_log", "backbone.layers.{bid}.mixer.A_log", - "model.layers.{bid}.mamba.A_log", + "model.layers.{bid}.mamba.A_log", ), MODEL_TENSOR.SSM_D: ( @@ -1177,7 +1177,7 @@ class TensorNameMap: MODEL_TENSOR.V_RESMPL_ATTN_OUT: ( "resampler.attn.out_proj", ), - + MODEL_TENSOR.V_RESMPL_KV: ( "resampler.kv_proj", ), From 58e3866d02260ffd3baea6dc0530c528ffb66228 Mon Sep 17 00:00:00 2001 From: Younes B <49240599+younesbelkada@users.noreply.github.com> Date: Tue, 8 Jul 2025 12:30:55 +0400 Subject: [PATCH 63/98] Update src/llama-model.cpp MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Sigbjørn Skjæret --- src/llama-model.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 421ebe9628b9b..cdc753965c7b0 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -14868,7 +14868,7 @@ struct llm_build_falcon_h1 : public llm_graph_context { // TODO: skip computing output earlier for unused tokens y = ggml_add(ctx0, y, ggml_mul(ctx0, x, model.layers[il].ssm_d)); - y = ggml_mul(ctx0, y, ggml_silu(ctx0, ggml_cont(ctx0, z))); + y = ggml_swiglu_split(ctx0, ggml_cont(ctx0, z), y); // grouped RMS norm if (model.layers[il].ssm_norm) { From 9b9264830253138f8cb0b4792fc71adf310f05de Mon Sep 17 00:00:00 2001 From: ibrahimkhadraoui Date: Tue, 8 Jul 2025 13:14:47 +0400 Subject: [PATCH 64/98] flake8 fixes --- convert_hf_to_gguf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index c015be535327b..440898a45e4b9 100755 --- a/convert_hf_to_gguf.py +++ 
b/convert_hf_to_gguf.py @@ -6669,7 +6669,7 @@ def set_gguf_parameters(self): # Add any other Falcon Mamba2 specific configuration self.gguf_writer.add_rope_freq_base(self.find_hparam(["rope_theta"])) - + @ModelBase.register("HunYuanMoEV1ForCausalLM") class HunYuanMoEModel(TextModel): model_arch = gguf.MODEL_ARCH.HUNYUAN_MOE From 7fe1794cc3b52500792049605e4744d999c65dd7 Mon Sep 17 00:00:00 2001 From: ibrahim khadraoui <132432132+ibrahimkhadraoui@users.noreply.github.com> Date: Tue, 8 Jul 2025 13:22:56 +0400 Subject: [PATCH 65/98] Update src/llama-hparams.cpp MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Sigbjørn Skjæret --- src/llama-hparams.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/llama-hparams.cpp b/src/llama-hparams.cpp index bf7aece8de4e1..64e7831cf649d 100644 --- a/src/llama-hparams.cpp +++ b/src/llama-hparams.cpp @@ -74,8 +74,8 @@ uint32_t llama_hparams::n_embd_r() const { // TODO: maybe support other convolution strides than 1 // NOTE: since the first column of the conv_state is shifted out each time, it's not actually needed // Corresponds to Mamba's conv_states size - - // check if the architecture is using d_ssm + + // check if the architecture is using d_ssm return (ssm_d_conv > 0 ? ssm_d_conv - 1 : 0) * (ssm_d_inner + 2*ssm_n_group*ssm_d_state); } From 40058c043f88a6d25e089c8854485dc8c121c2ef Mon Sep 17 00:00:00 2001 From: ibrahim khadraoui <132432132+ibrahimkhadraoui@users.noreply.github.com> Date: Tue, 8 Jul 2025 13:23:10 +0400 Subject: [PATCH 66/98] Update src/llama-model.cpp MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Sigbjørn Skjæret --- src/llama-model.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama-model.cpp b/src/llama-model.cpp index af37a83438169..1d9c573a8228c 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -4600,7 +4600,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) { if (output == NULL) { output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); } - + for (int i = 0; i < n_layer; ++i) { auto & layer = layers[i]; layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0); From debf4e5dd5bf9bb1c0ec8353e45a60cb9e9af826 Mon Sep 17 00:00:00 2001 From: ibrahim khadraoui <132432132+ibrahimkhadraoui@users.noreply.github.com> Date: Tue, 8 Jul 2025 13:23:19 +0400 Subject: [PATCH 67/98] Update src/llama-model.cpp MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Sigbjørn Skjæret --- src/llama-model.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 1d9c573a8228c..4eb0cd5d13f5e 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -14738,7 +14738,7 @@ struct llm_build_falcon_h1 : public llm_graph_context { inpSA = ggml_add(ctx0, cur, inpSA); cb(cur, "layer_out", il); - if (il == n_layer - 1 && inp_out_ids) { + if (il == n_layer - 1 && inp_out_ids) { cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } From 212edffd868780d541375c808bdf203447c6a4bb Mon Sep 17 00:00:00 2001 From: ibrahim khadraoui <132432132+ibrahimkhadraoui@users.noreply.github.com> Date: Tue, 8 Jul 2025 13:23:37 +0400 Subject: [PATCH 68/98] Update src/llama-arch.cpp MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 
Co-authored-by: Sigbjørn Skjæret --- src/llama-arch.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index 6509ca7f37470..10c38d3691cc2 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -129,7 +129,7 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_EMBEDDING_SCALE, "%s.embedding_scale" }, { LLM_KV_TOKEN_SHIFT_COUNT, "%s.token_shift_count" }, { LLM_KV_INTERLEAVE_MOE_LAYER_STEP, "%s.interleave_moe_layer_step" }, - + { LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" }, { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" }, { LLM_KV_ATTENTION_MAX_ALIBI_BIAS, "%s.attention.max_alibi_bias" }, From 90ddf2412ad3ecc154c409da1cfd17d2e884367f Mon Sep 17 00:00:00 2001 From: ibrahim khadraoui <132432132+ibrahimkhadraoui@users.noreply.github.com> Date: Tue, 8 Jul 2025 13:23:56 +0400 Subject: [PATCH 69/98] Update convert_hf_to_gguf.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Sigbjørn Skjæret --- convert_hf_to_gguf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index c015be535327b..440898a45e4b9 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -6669,7 +6669,7 @@ def set_gguf_parameters(self): # Add any other Falcon Mamba2 specific configuration self.gguf_writer.add_rope_freq_base(self.find_hparam(["rope_theta"])) - + @ModelBase.register("HunYuanMoEV1ForCausalLM") class HunYuanMoEModel(TextModel): model_arch = gguf.MODEL_ARCH.HUNYUAN_MOE From c3c5d51c6ab60a2ed738d2bf7954bcaf4fac9109 Mon Sep 17 00:00:00 2001 From: ibrahimkhadraoui Date: Tue, 8 Jul 2025 13:37:14 +0400 Subject: [PATCH 70/98] added hashes --- convert_hf_to_gguf.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 440898a45e4b9..d5c7e2e9d8d21 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -680,6 +680,9 @@ def get_vocab_base_pre(self, tokenizer) -> str: if chkhsh == "8aeee3860c56296a157a1fe2fad249ec40aa59b1bb5709f4ade11c4e6fe652ed": # ref: https://huggingface.co/tiiuae/falcon-7b res = "falcon" + if (chkhsh == "60476e1243776c4fb1b993dbd7a5f15ac22f83c80afdf425fa5ae01c8d44ef86" or chkhsh == "3eda48b4c4dc7de733d1a8b3e3b4a85243dbbf704da2ee9d42c6beced8897896" or chkhsh == "48f8e02c0359c0bbdd82f26909171fac1c18a457bb47573ed1fe3bbb2c1cfd4b" or chkhsh == "a6b57017d60e6edb4d88ecc2845188e0eb333a70357e45dcc9b53964a73bbae6"): + # ref: https://huggingface.co/collections/tiiuae/falcon-h1-6819f2795bc406da60fab8df + res = "falcon_h1" if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f": # ref: https://huggingface.co/BAAI/bge-small-en-v1.5 res = "bert-bge" From f8d7c970a7b7776e2f7329170f0dd3e11176cdf1 Mon Sep 17 00:00:00 2001 From: Younes B <49240599+younesbelkada@users.noreply.github.com> Date: Tue, 8 Jul 2025 13:54:53 +0400 Subject: [PATCH 71/98] Update src/llama-arch.cpp Co-authored-by: Georgi Gerganov --- src/llama-arch.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index 10c38d3691cc2..8dc538d738495 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -46,7 +46,7 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_STARCODER2, "starcoder2" }, { LLM_ARCH_MAMBA, "mamba" }, { LLM_ARCH_MAMBA2, "mamba2" }, - { LLM_ARCH_FALCON_H1, "falcon_h1" }, + { LLM_ARCH_FALCON_H1, "falcon-h1" }, { LLM_ARCH_XVERSE, "xverse" }, { LLM_ARCH_COMMAND_R, "command-r" }, { 
LLM_ARCH_COHERE2, "cohere2" }, From 4610ee202008b5c39ea296af143c607db5cb46b6 Mon Sep 17 00:00:00 2001 From: Younes B <49240599+younesbelkada@users.noreply.github.com> Date: Tue, 8 Jul 2025 13:55:00 +0400 Subject: [PATCH 72/98] Update src/llama-vocab.cpp Co-authored-by: Georgi Gerganov --- src/llama-vocab.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp index ec356cec45804..b7f14dc07b609 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -1523,7 +1523,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { tokenizer_pre == "llama-v3" || tokenizer_pre == "llama-bpe"|| tokenizer_pre == "falcon3" || - tokenizer_pre == "falcon_h1" || + tokenizer_pre == "falcon-h1" || tokenizer_pre == "pixtral") { pre_type = LLAMA_VOCAB_PRE_TYPE_LLAMA3; ignore_merges = true; From 082ab4ad2a3927384d878666a5f8cae4eb15f577 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Tue, 8 Jul 2025 13:55:54 +0400 Subject: [PATCH 73/98] update the update file --- convert_hf_to_gguf_update.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/convert_hf_to_gguf_update.py b/convert_hf_to_gguf_update.py index 721eb1d86dd09..e065292133b15 100755 --- a/convert_hf_to_gguf_update.py +++ b/convert_hf_to_gguf_update.py @@ -86,7 +86,10 @@ class TOKENIZER_TYPE(IntEnum): {"name": "falcon", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/falcon-7b", }, {"name": "bert-bge", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/BAAI/bge-small-en-v1.5", }, {"name": "falcon3", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon3-7B-Base", }, - {"name": "falcon-h1", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon-H1-7B-Base", }, + {"name": "falcon-h1-500m", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon-H1-0.5B-Base", }, + {"name": "falcon-h1-1b", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon-H1-1B-Base", }, + {"name": "falcon-h1-7b", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon-H1-7B-Base", }, + {"name": "falcon-h1-34b", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon-H1-34B-Base", }, {"name": "bert-bge-large", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/BAAI/bge-large-zh-v1.5", }, {"name": "mpt", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mosaicml/mpt-7b", }, {"name": "starcoder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigcode/starcoder2-3b", }, From c5515e3656260ddd55851431456e335d404a7c28 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Tue, 8 Jul 2025 13:56:39 +0400 Subject: [PATCH 74/98] Revert "update the update file" This reverts commit 082ab4ad2a3927384d878666a5f8cae4eb15f577. 
--- convert_hf_to_gguf_update.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/convert_hf_to_gguf_update.py b/convert_hf_to_gguf_update.py index e065292133b15..721eb1d86dd09 100755 --- a/convert_hf_to_gguf_update.py +++ b/convert_hf_to_gguf_update.py @@ -86,10 +86,7 @@ class TOKENIZER_TYPE(IntEnum): {"name": "falcon", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/falcon-7b", }, {"name": "bert-bge", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/BAAI/bge-small-en-v1.5", }, {"name": "falcon3", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon3-7B-Base", }, - {"name": "falcon-h1-500m", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon-H1-0.5B-Base", }, - {"name": "falcon-h1-1b", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon-H1-1B-Base", }, - {"name": "falcon-h1-7b", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon-H1-7B-Base", }, - {"name": "falcon-h1-34b", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon-H1-34B-Base", }, + {"name": "falcon-h1", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon-H1-7B-Base", }, {"name": "bert-bge-large", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/BAAI/bge-large-zh-v1.5", }, {"name": "mpt", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mosaicml/mpt-7b", }, {"name": "starcoder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigcode/starcoder2-3b", }, From 1ef53b31130398e6881545206559afecc4a0d455 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Tue, 8 Jul 2025 14:03:56 +0400 Subject: [PATCH 75/98] fix: address suggestions --- convert_hf_to_gguf_update.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/convert_hf_to_gguf_update.py b/convert_hf_to_gguf_update.py index 721eb1d86dd09..15a326e695dd5 100755 --- a/convert_hf_to_gguf_update.py +++ b/convert_hf_to_gguf_update.py @@ -86,7 +86,6 @@ class TOKENIZER_TYPE(IntEnum): {"name": "falcon", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/falcon-7b", }, {"name": "bert-bge", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/BAAI/bge-small-en-v1.5", }, {"name": "falcon3", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon3-7B-Base", }, - {"name": "falcon-h1", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon-H1-7B-Base", }, {"name": "bert-bge-large", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/BAAI/bge-large-zh-v1.5", }, {"name": "mpt", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mosaicml/mpt-7b", }, {"name": "starcoder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigcode/starcoder2-3b", }, @@ -139,6 +138,11 @@ class TOKENIZER_TYPE(IntEnum): {"name": "glm4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/THUDM/glm-4-9b-hf", "chkhsh": "a1336059768a55c99a734006ffb02203cd450fed003e9a71886c88acf24fdbc2"}, {"name": "minerva-7b", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0", "chkhsh": "1431a23e583c97432bc230bff598d103ddb5a1f89960c8f1d1051aaa944d0b35"}, {"name": "hunyuan", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tencent/Hunyuan-A13B-Instruct", "chkhsh": "7e57df22b1fe23a7b1e1c7f3dc4e3f96d43a4eb0836d0c6bdc3436d7b2f1c664"}, + # falcon-h1 series uses 4 different tokenizers across model sizes (0.5b - 34b), hence we need to define 4 different hashes + 
{"name": "falcon-h1", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon-H1-0.5B-Base", "chkhsh": "a6b57017d60e6edb4d88ecc2845188e0eb333a70357e45dcc9b53964a73bbae6"}, + {"name": "falcon-h1", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon-H1-1B-Base", "chkhsh": "60476e1243776c4fb1b993dbd7a5f15ac22f83c80afdf425fa5ae01c8d44ef86"}, + {"name": "falcon-h1", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon-H1-7B-Base", "chkhsh": "3eda48b4c4dc7de733d1a8b3e3b4a85243dbbf704da2ee9d42c6beced8897896"}, + {"name": "falcon-h1", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon-H1-34B-Base", "chkhsh": "48f8e02c0359c0bbdd82f26909171fac1c18a457bb47573ed1fe3bbb2c1cfd4b"}, ] From d5efbd012fae1c029d79d6755b8454e55f692689 Mon Sep 17 00:00:00 2001 From: Younes B <49240599+younesbelkada@users.noreply.github.com> Date: Tue, 8 Jul 2025 12:05:28 +0200 Subject: [PATCH 76/98] fix: update convert_hf_to_gguf.py --- convert_hf_to_gguf.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index d5c7e2e9d8d21..adf115fca50fa 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -680,9 +680,6 @@ def get_vocab_base_pre(self, tokenizer) -> str: if chkhsh == "8aeee3860c56296a157a1fe2fad249ec40aa59b1bb5709f4ade11c4e6fe652ed": # ref: https://huggingface.co/tiiuae/falcon-7b res = "falcon" - if (chkhsh == "60476e1243776c4fb1b993dbd7a5f15ac22f83c80afdf425fa5ae01c8d44ef86" or chkhsh == "3eda48b4c4dc7de733d1a8b3e3b4a85243dbbf704da2ee9d42c6beced8897896" or chkhsh == "48f8e02c0359c0bbdd82f26909171fac1c18a457bb47573ed1fe3bbb2c1cfd4b" or chkhsh == "a6b57017d60e6edb4d88ecc2845188e0eb333a70357e45dcc9b53964a73bbae6"): - # ref: https://huggingface.co/collections/tiiuae/falcon-h1-6819f2795bc406da60fab8df - res = "falcon_h1" if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f": # ref: https://huggingface.co/BAAI/bge-small-en-v1.5 res = "bert-bge" @@ -821,6 +818,18 @@ def get_vocab_base_pre(self, tokenizer) -> str: if chkhsh == "7e57df22b1fe23a7b1e1c7f3dc4e3f96d43a4eb0836d0c6bdc3436d7b2f1c664": # ref: https://huggingface.co/tencent/Hunyuan-A13B-Instruct res = "hunyuan" + if chkhsh == "a6b57017d60e6edb4d88ecc2845188e0eb333a70357e45dcc9b53964a73bbae6": + # ref: https://huggingface.co/tiiuae/Falcon-H1-0.5B-Base + res = "falcon-h1" + if chkhsh == "60476e1243776c4fb1b993dbd7a5f15ac22f83c80afdf425fa5ae01c8d44ef86": + # ref: https://huggingface.co/tiiuae/Falcon-H1-1B-Base + res = "falcon-h1" + if chkhsh == "3eda48b4c4dc7de733d1a8b3e3b4a85243dbbf704da2ee9d42c6beced8897896": + # ref: https://huggingface.co/tiiuae/Falcon-H1-7B-Base + res = "falcon-h1" + if chkhsh == "48f8e02c0359c0bbdd82f26909171fac1c18a457bb47573ed1fe3bbb2c1cfd4b": + # ref: https://huggingface.co/tiiuae/Falcon-H1-34B-Base + res = "falcon-h1" if res is None: logger.warning("\n") From a5afc8bcbae34f412aa653fe8c189076ed89a004 Mon Sep 17 00:00:00 2001 From: Younes B <49240599+younesbelkada@users.noreply.github.com> Date: Tue, 8 Jul 2025 14:14:04 +0400 Subject: [PATCH 77/98] Update gguf-py/gguf/constants.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Sigbjørn Skjæret --- gguf-py/gguf/constants.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 4eed7ad6cddc9..b7b7821725524 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py 
@@ -663,7 +663,7 @@ class MODEL_TENSOR(IntEnum): MODEL_ARCH.DOTS1: "dots1", MODEL_ARCH.ARCEE: "arcee", MODEL_ARCH.ERNIE4_5: "ernie4_5", - MODEL_ARCH.FALCON_H1: "falcon_h1", + MODEL_ARCH.FALCON_H1: "falcon-h1", MODEL_ARCH.HUNYUAN_MOE: "hunyuan-moe", } From 99f9a3dd89d965df62aa1e450ed3a083154ac55c Mon Sep 17 00:00:00 2001 From: Younes B <49240599+younesbelkada@users.noreply.github.com> Date: Tue, 8 Jul 2025 14:53:57 +0400 Subject: [PATCH 78/98] Update src/llama-model-loader.cpp MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Sigbjørn Skjæret --- src/llama-model-loader.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp index a2eed670acb63..bd9e6da8832b7 100644 --- a/src/llama-model-loader.cpp +++ b/src/llama-model-loader.cpp @@ -400,7 +400,6 @@ namespace GGUFMeta { template bool llama_model_loader::get_key (enum llm_kv kid, bool & result, bool required); template bool llama_model_loader::get_key (enum llm_kv kid, float & result, bool required); - template bool llama_model_loader::get_key (enum llm_kv kid, double & result, bool required); template bool llama_model_loader::get_key (enum llm_kv kid, uint32_t & result, bool required); template bool llama_model_loader::get_key(enum llm_kv kid, std::string & result, bool required); From c3c64c3583180f1f378ddfd1311105ced7f214d6 Mon Sep 17 00:00:00 2001 From: ibrahimkhadraoui Date: Tue, 8 Jul 2025 15:31:26 +0400 Subject: [PATCH 79/98] d_inner fixed --- convert_hf_to_gguf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index adf115fca50fa..45ca9658b9db1 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -6570,7 +6570,7 @@ def __init__(self, *args, **kwargs): # n_group and d_inner are used during reshape_tensors for mamaba2 self.d_model = self.find_hparam(["hidden_size", "d_model"]) self.n_group = self.find_hparam(["n_groups"]) - self.d_inner = self.find_hparam(["expand"]) * self.d_model + self.d_inner = self.find_hparam(["mamba_d_ssm"]) # Initialize any Falcon Mamba2 specific attributes self.has_attention = True # Falcon Mamba2 has attention components From 63e3afc93f57291b299d81584f1873d69b1e7adf Mon Sep 17 00:00:00 2001 From: Younes B <49240599+younesbelkada@users.noreply.github.com> Date: Tue, 8 Jul 2025 15:53:21 +0400 Subject: [PATCH 80/98] Update src/llama-model.cpp MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Sigbjørn Skjæret --- src/llama-model.cpp | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 4eb0cd5d13f5e..4fa064c4b6f1f 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -4550,17 +4550,17 @@ bool llama_model::load_tensors(llama_model_loader & ml) { /*SSM LAYERS*/ // ssm in layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {hidden_size, ssm_projection_size}, 0); - layer.ssm_in_b = create_tensor(tn(LLM_TENSOR_SSM_IN, "bias", i), {n_embd, ssm_projection_size}, llama_model_loader::TENSOR_NOT_REQUIRED); + layer.ssm_in_b = create_tensor(tn(LLM_TENSOR_SSM_IN, "bias", i), {n_embd, ssm_projection_size}, TENSOR_NOT_REQUIRED); // ssm 1d conv layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {ssm_conv_kernel_size, ssm_conv_dim}, 0); - layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {ssm_conv_dim}, 
llama_model_loader::TENSOR_NOT_REQUIRED); + layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {ssm_conv_dim}, TENSOR_NOT_REQUIRED); // ssm_dt layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {ssm_num_heads}, 0); // no "weight" suffix for these layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {1, ssm_num_heads}, 0); layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {1, ssm_num_heads}, 0); // ssm_norm - layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), {ssm_intermediate_size / ssm_n_groups, ssm_n_groups}, llama_model_loader::TENSOR_NOT_REQUIRED); + layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), {ssm_intermediate_size / ssm_n_groups, ssm_n_groups}, TENSOR_NOT_REQUIRED); // out_proj layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {ssm_intermediate_size, hidden_size}, 0); @@ -4570,23 +4570,23 @@ bool llama_model::load_tensors(llama_model_loader & ml) { layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {hidden_size, attn_num_key_value_head * n_embd_head_k}, 0); layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {hidden_size, attn_num_key_value_head * n_embd_head_v}, 0); layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * attn_num_attention_head, hidden_size}, 0); - layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {hidden_size}, llama_model_loader::TENSOR_NOT_REQUIRED); - layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {attn_num_key_value_head * n_embd_head_k}, llama_model_loader::TENSOR_NOT_REQUIRED); - layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {attn_num_key_value_head * n_embd_head_v}, llama_model_loader::TENSOR_NOT_REQUIRED); - layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {hidden_size}, llama_model_loader::TENSOR_NOT_REQUIRED); + layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {hidden_size}, TENSOR_NOT_REQUIRED); + layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {attn_num_key_value_head * n_embd_head_k}, TENSOR_NOT_REQUIRED); + layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {attn_num_key_value_head * n_embd_head_v}, TENSOR_NOT_REQUIRED); + layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {hidden_size}, TENSOR_NOT_REQUIRED); layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {hidden_size}, 0); // feed forward (w/ optional biases) layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, i), {hidden_size}, 0); - layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0)); + layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? 
TENSOR_DUPLICATED : 0)); layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {hidden_size, ffn_intermediate_size}, 0); layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { ffn_intermediate_size, hidden_size}, 0); layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {hidden_size, ffn_intermediate_size}, 0); - layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {ffn_intermediate_size}, llama_model_loader::TENSOR_NOT_REQUIRED); - layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {hidden_size}, llama_model_loader::TENSOR_NOT_REQUIRED); - layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {ffn_intermediate_size}, llama_model_loader::TENSOR_NOT_REQUIRED); + layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {ffn_intermediate_size}, TENSOR_NOT_REQUIRED); + layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {hidden_size}, TENSOR_NOT_REQUIRED); + layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {ffn_intermediate_size}, TENSOR_NOT_REQUIRED); } } break; case LLM_ARCH_HUNYUAN_MOE: From d7585783d8f4387a5e216c47f7383685d082a43f Mon Sep 17 00:00:00 2001 From: ibrahimkhadraoui Date: Tue, 8 Jul 2025 19:44:46 +0400 Subject: [PATCH 81/98] reshaping ssm_norm for 34B --- convert_hf_to_gguf.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 45ca9658b9db1..c4a618f72c8e3 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -6645,6 +6645,8 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter tensor = tensor * self.hparams["lm_head_multiplier"] elif "embed_tokens" in name: tensor = tensor * self.hparams["embedding_multiplier"] + elif "mamba.norm" in name: + tensor = tensor.reshape(self.n_group, self.hparams["mamba_d_ssm"] // self.n_group) tensors = [(tensors[0][0], tensor)] return tensors From 7897c217d4cd221d9f2d90ebb7521c22c8b4fea5 Mon Sep 17 00:00:00 2001 From: ibrahimkhadraoui Date: Tue, 8 Jul 2025 19:49:00 +0400 Subject: [PATCH 82/98] removing generate_mup --- convert_hf_to_gguf.py | 18 +----------------- 1 file changed, 1 insertion(+), 17 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index c4a618f72c8e3..7d21fcb49bbfe 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -6568,7 +6568,6 @@ def __init__(self, *args, **kwargs): self._transformer_model_class = LlamaModel # n_group and d_inner are used during reshape_tensors for mamaba2 - self.d_model = self.find_hparam(["hidden_size", "d_model"]) self.n_group = self.find_hparam(["n_groups"]) self.d_inner = self.find_hparam(["mamba_d_ssm"]) @@ -6598,21 +6597,6 @@ def find_hparam(self, keys: Iterable[str], *args, **kwargs) -> Any: def set_vocab(self): self._set_vocab_gpt2() - def _generate_mup_vector(self, block_id: int) -> torch.Tensor: - zxbcdt_multipliers = self.hparams["ssm_multipliers"] - intermediate_size = self.hparams["mamba_d_ssm"] - groups_time_state_size = self.hparams["mamba_n_groups"] * self.hparams["mamba_d_state"] - vector_shape = (2 * intermediate_size + 2 * groups_time_state_size + self.hparams["mamba_n_heads"]) - - mup_vector = torch.ones(1, 1, vector_shape, dtype=torch.float64) - mup_vector[:, :, :intermediate_size] *= zxbcdt_multipliers[0] - mup_vector[:, :, intermediate_size:2 * intermediate_size] *= zxbcdt_multipliers[1] - mup_vector[:, :, 2 * intermediate_size:2 * intermediate_size + groups_time_state_size] *= zxbcdt_multipliers[2] - mup_vector[:, :, 2 * 
intermediate_size + groups_time_state_size:2 * intermediate_size + 2 * groups_time_state_size] *= zxbcdt_multipliers[3] - mup_vector[:, :, 2 * intermediate_size + 2 * groups_time_state_size:] *= zxbcdt_multipliers[4] - - return mup_vector - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: tensors = list(super().modify_tensors(data_torch, name, bid)) tensor = tensors[0][1] @@ -6646,7 +6630,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter elif "embed_tokens" in name: tensor = tensor * self.hparams["embedding_multiplier"] elif "mamba.norm" in name: - tensor = tensor.reshape(self.n_group, self.hparams["mamba_d_ssm"] // self.n_group) + tensor = tensor.reshape(self.n_group, self.d_inner // self.n_group) tensors = [(tensors[0][0], tensor)] return tensors From 6403caae9fa7dbad75400aa90fafbd011ddd5ad1 Mon Sep 17 00:00:00 2001 From: ibrahimkhadraoui Date: Tue, 8 Jul 2025 20:39:49 +0400 Subject: [PATCH 83/98] remove duplicates metadata keys --- convert_hf_to_gguf.py | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 7d21fcb49bbfe..eed230594cb4c 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -6570,6 +6570,7 @@ def __init__(self, *args, **kwargs): # n_group and d_inner are used during reshape_tensors for mamaba2 self.n_group = self.find_hparam(["n_groups"]) self.d_inner = self.find_hparam(["mamba_d_ssm"]) + self.d_head = self.find_hparam(["d_head"]) # Initialize any Falcon Mamba2 specific attributes self.has_attention = True # Falcon Mamba2 has attention components @@ -6639,30 +6640,25 @@ def set_gguf_parameters(self): super().set_gguf_parameters() ## General Params ## - self.gguf_writer.add_block_count(self.block_count) - self.gguf_writer.add_context_length(self.hparams.get("max_position_embeddings", 0)) self.gguf_writer.add_vocab_size(self.hparams["vocab_size"]) self.gguf_writer.add_add_bos_token(False) + # Override some Mamba2 defaults + self.gguf_writer.add_block_count(self.block_count) + self.gguf_writer.add_context_length(self.hparams.get("max_position_embeddings", 0)) self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) ## Mamba mixer params ## - self.gguf_writer.add_ssm_conv_kernel(self.find_hparam(["conv_kernel", "d_conv"])) - self.gguf_writer.add_ssm_group_count(self.n_group) - self.gguf_writer.add_ssm_inner_size(self.find_hparam(["mamba_d_ssm"])) - self.gguf_writer.add_ssm_head_dim(d_head := self.find_hparam(["d_head"])) - self.gguf_writer.add_ssm_time_step_rank(self.find_hparam(["n_heads"])) + self.gguf_writer.add_ssm_head_dim(self.d_head) ## Attention params ## - self.gguf_writer.add_head_count(self.hparams["num_attention_heads"]) - self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"] if "num_key_value_heads" in self.hparams else self.hparams["num_attention_heads"]) - self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) + self.gguf_writer.add_head_count(self.hparams["num_attention_heads"]) # Override value 0 from Mamba2 + self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"]) self.gguf_writer.add_key_length(self.hparams["head_dim"]) self.gguf_writer.add_value_length(self.hparams["head_dim"]) ## Validation ## assert self.hparams.get("hidden_act") in [None, "silu"], "Only SILU activation supported" - assert self.d_inner % d_head == 0, f"SSM inner size {self.d_inner} not a multiple of head dim {d_head}" - 
self.gguf_writer.add_head_count_kv(self.find_hparam(["num_key_value_heads"], optional=True) or self.find_hparam(["num_attention_heads"])) + assert self.d_inner % self.d_head == 0, f"SSM inner size {self.d_inner} not a multiple of head dim {self.d_head}" # Add any other Falcon Mamba2 specific configuration self.gguf_writer.add_rope_freq_base(self.find_hparam(["rope_theta"])) From 710630a51b1d9c2e76084a599033e5f8dc7c319c Mon Sep 17 00:00:00 2001 From: ibrahimkhadraoui Date: Tue, 8 Jul 2025 21:03:14 +0400 Subject: [PATCH 84/98] rm comment --- src/llama-hparams.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/llama-hparams.cpp b/src/llama-hparams.cpp index 64e7831cf649d..86c814d51b901 100644 --- a/src/llama-hparams.cpp +++ b/src/llama-hparams.cpp @@ -74,8 +74,6 @@ uint32_t llama_hparams::n_embd_r() const { // TODO: maybe support other convolution strides than 1 // NOTE: since the first column of the conv_state is shifted out each time, it's not actually needed // Corresponds to Mamba's conv_states size - - // check if the architecture is using d_ssm return (ssm_d_conv > 0 ? ssm_d_conv - 1 : 0) * (ssm_d_inner + 2*ssm_n_group*ssm_d_state); } From ecc5253dfa9dd3077bba0a01a1a1e9361ef1dc47 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Tue, 8 Jul 2025 21:37:06 +0400 Subject: [PATCH 85/98] final comment --- src/llama-model.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 3773ff6f214af..2732147181f27 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -14840,7 +14840,7 @@ struct llm_build_falcon_h1 : public llm_graph_context { const int64_t d_inner = hparams.ssm_d_inner; const int64_t d_state = hparams.ssm_d_state; const int64_t n_head = hparams.ssm_dt_rank; - const int64_t head_dim = hparams.ssm_head_dim == 0 ? 
d_inner / n_head : hparams.ssm_head_dim; + const int64_t head_dim = d_inner / n_head; const int64_t n_group = hparams.ssm_n_group; const int64_t n_seqs = ubatch.n_seqs; From bbca33ebcace2cbb1c63cfb4f48e0855c8349f00 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Tue, 8 Jul 2025 22:51:39 +0400 Subject: [PATCH 86/98] fix unused args --- convert_hf_to_gguf.py | 3 --- gguf-py/gguf/gguf_writer.py | 3 --- src/llama-hparams.h | 1 - 3 files changed, 7 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 27c93f66089ff..b468970bea3b6 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -6647,9 +6647,6 @@ def set_gguf_parameters(self): self.gguf_writer.add_context_length(self.hparams.get("max_position_embeddings", 0)) self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) - ## Mamba mixer params ## - self.gguf_writer.add_ssm_head_dim(self.d_head) - ## Attention params ## self.gguf_writer.add_head_count(self.hparams["num_attention_heads"]) # Override value 0 from Mamba2 self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"]) diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index b1d9b6f1985a3..a7ecf3d31209f 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -867,9 +867,6 @@ def add_ssm_group_count(self, value: int) -> None: def add_ssm_dt_b_c_rms(self, value: bool) -> None: self.add_bool(Keys.SSM.DT_B_C_RMS.format(arch=self.arch), value) - def add_ssm_head_dim(self, value: int) -> None: - self.add_uint32(Keys.SSM.HEAD_DIM.format(arch=self.arch), value) - def add_tokenizer_model(self, model: str) -> None: self.add_string(Keys.Tokenizer.MODEL, model) diff --git a/src/llama-hparams.h b/src/llama-hparams.h index d663a0f7cbd26..476d0a5eade28 100644 --- a/src/llama-hparams.h +++ b/src/llama-hparams.h @@ -115,7 +115,6 @@ struct llama_hparams { uint32_t ssm_d_state = 0; uint32_t ssm_dt_rank = 0; uint32_t ssm_n_group = 0; - uint32_t ssm_head_dim = 0; // for hybrid state space models std::array recurrent_layer_arr; From 9f514e3925620ab0b41009f23e6c4e37974b1ac4 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Tue, 8 Jul 2025 23:02:19 +0400 Subject: [PATCH 87/98] fix constants --- gguf-py/gguf/constants.py | 1 - 1 file changed, 1 deletion(-) diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 3e793e62d2ae2..93eec43556c74 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -172,7 +172,6 @@ class SSM: TIME_STEP_RANK = "{arch}.ssm.time_step_rank" GROUP_COUNT = "{arch}.ssm.group_count" DT_B_C_RMS = "{arch}.ssm.dt_b_c_rms" - HEAD_DIM = "{arch}.ssm.head_dim" class WKV: HEAD_SIZE = "{arch}.wkv.head_size" From 34c5d830bab5461dd341e8199d51addf956139d1 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Tue, 8 Jul 2025 23:07:24 +0400 Subject: [PATCH 88/98] fix bad merge --- convert_hf_to_gguf.py | 1 + 1 file changed, 1 insertion(+) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index b468970bea3b6..4766470cd4be9 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -6813,6 +6813,7 @@ def prepare_tensors(self): @ModelBase.register("SmolLM3ForCausalLM") class SmolLM3Model(LlamaModel): model_arch = gguf.MODEL_ARCH.SMOLLM3 + ###### CONVERSION LOGIC ###### From 521e82350de29dc30fbce34edc7c5593397eb662 Mon Sep 17 00:00:00 2001 From: Younes B <49240599+younesbelkada@users.noreply.github.com> Date: Wed, 9 Jul 2025 00:08:39 +0400 Subject: [PATCH 89/98] Update src/llama-model.cpp Co-authored-by: compilade --- src/llama-model.cpp | 1 - 1 file 
changed, 1 deletion(-) diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 2732147181f27..1a7d4c2e468f7 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -15426,7 +15426,6 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params, cparams.n_ctx = GGML_PAD(cparams.n_ctx, padding); - res = new llama_memory_hybrid( /* model */ *this, /* attn_type_k */ params.type_k, From 6943f4ea4a51cf035620bb028070328bf6a90539 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Wed, 9 Jul 2025 00:11:36 +0400 Subject: [PATCH 90/98] falcon-h1: remove unused ssm_in_b and bad merge --- src/llama-model.cpp | 2 +- src/llama-model.h | 3 --- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 1a7d4c2e468f7..1d4dd798acfbf 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -4560,7 +4560,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) { /*SSM LAYERS*/ // ssm in layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {hidden_size, ssm_projection_size}, 0); - layer.ssm_in_b = create_tensor(tn(LLM_TENSOR_SSM_IN, "bias", i), {n_embd, ssm_projection_size}, TENSOR_NOT_REQUIRED); // ssm 1d conv layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {ssm_conv_kernel_size, ssm_conv_dim}, 0); layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {ssm_conv_dim}, TENSOR_NOT_REQUIRED); @@ -4613,6 +4612,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) { for (int i = 0; i < n_layer; ++i) { auto & layer = layers[i]; + layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0); layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0); diff --git a/src/llama-model.h b/src/llama-model.h index 4e4e7cad8d9b3..70a6dc89e1b06 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -225,9 +225,6 @@ struct llama_layer { struct ggml_tensor * ffn_down_enc = nullptr; struct ggml_tensor * ffn_up_enc = nullptr; - // falcon_h1 - struct ggml_tensor * ssm_in_b = nullptr; - // ff MoE struct ggml_tensor * ffn_gate_inp = nullptr; struct ggml_tensor * ffn_gate_exps = nullptr; From 4d2c94b72946f941c07781d96efcdece6a7f2529 Mon Sep 17 00:00:00 2001 From: Younes B <49240599+younesbelkada@users.noreply.github.com> Date: Wed, 9 Jul 2025 00:46:11 +0400 Subject: [PATCH 91/98] Update src/llama-model.cpp MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Sigbjørn Skjæret --- src/llama-model.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 1d4dd798acfbf..6a61deeb74e18 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -1575,6 +1575,8 @@ void llama_model::load_hparams(llama_model_loader & ml) { type = LLM_TYPE_3B; break; case 44: type = LLM_TYPE_7B; break; + default: + type = LLM_TYPE_UNKNOWN; } } break; case LLM_ARCH_HUNYUAN_MOE: From b7c9a995f2ac5964cf6c7a70194d8b2b84b7ac46 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Wed, 9 Jul 2025 00:48:52 +0400 Subject: [PATCH 92/98] falcon-h1: fix last comment --- src/llama-model.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 6a61deeb74e18..4b3387265a08b 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -1575,6 +1575,8 @@ void llama_model::load_hparams(llama_model_loader & ml) { type = LLM_TYPE_3B; break; case 44: type = LLM_TYPE_7B; break; + case 72: + type = LLM_TYPE_34B; break; default: 
type = LLM_TYPE_UNKNOWN; } From 9fd308d831529db23776e727ef5589bf3d862187 Mon Sep 17 00:00:00 2001 From: Younes B <49240599+younesbelkada@users.noreply.github.com> Date: Wed, 9 Jul 2025 08:20:48 +0400 Subject: [PATCH 93/98] Update convert_hf_to_gguf.py Co-authored-by: compilade --- convert_hf_to_gguf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 4766470cd4be9..c727645ff2d6c 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -4913,7 +4913,7 @@ def set_gguf_parameters(self): d_conv = self.find_hparam(["conv_kernel", "d_conv"], optional=True) or 4 d_inner = self.find_hparam(["mamba_d_ssm", "intermediate_size", "d_inner"], optional=True) or 2 * d_model d_state = self.find_hparam(["state_size", "d_state"], optional=True) or 128 - head_dim = self.find_hparam(["head_dim"], optional=True) or 64 + head_dim = self.find_hparam(["mamba_d_head", "head_dim"], optional=True) or 64 n_group = self.find_hparam(["n_groups"], optional=True) or 1 rms_norm_eps = self.find_hparam(["layer_norm_epsilon", "rms_norm_eps"], optional=True) or 1e-5 From 51f50bfb6e1fb78ef417862dece0d8fdc600e799 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Wed, 9 Jul 2025 08:28:10 +0400 Subject: [PATCH 94/98] falcon-h1: revert add_add_bos(False) --- convert_hf_to_gguf.py | 1 - 1 file changed, 1 deletion(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index c727645ff2d6c..189515c09ee60 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -6641,7 +6641,6 @@ def set_gguf_parameters(self): ## General Params ## self.gguf_writer.add_vocab_size(self.hparams["vocab_size"]) - self.gguf_writer.add_add_bos_token(False) # Override some Mamba2 defaults self.gguf_writer.add_block_count(self.block_count) self.gguf_writer.add_context_length(self.hparams.get("max_position_embeddings", 0)) From 367d8c58076ffaefffc8309cf9fc85469f2b2d77 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Wed, 9 Jul 2025 11:33:36 +0400 Subject: [PATCH 95/98] falcon-h1: fix tied weights --- src/llama-model.cpp | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 4b3387265a08b..1b1824e2b019e 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -4551,11 +4551,14 @@ bool llama_model::load_tensors(llama_model_loader & ml) { // embeddings tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {hidden_size, n_vocab}, 0); - + // output - { - output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {hidden_size, n_vocab}, TENSOR_NOT_REQUIRED); - output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {hidden_size}, 0); + output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {hidden_size, n_vocab}, TENSOR_NOT_REQUIRED); + output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {hidden_size}, 0); + + // if output is NULL, init from the input tok embed + if (output == NULL) { + output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); } for (int i = 0; i < n_layer; ++i) { From 1fa361bb469d7ea98aa31cf1af5fde00868cfff9 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Wed, 9 Jul 2025 11:37:36 +0400 Subject: [PATCH 96/98] falcon-h1: remove whitespace --- src/llama-model.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 1b1824e2b019e..b882e72df1c63 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -2084,7 +2084,6 @@ bool 
llama_model::load_tensors(llama_model_loader & ml) { const int64_t q_lora_rank = hparams.n_lora_q; const int64_t kv_lora_rank = hparams.n_lora_kv; tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); - // output output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0); output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED); From 6dde986aef21c1343b6b061019312cd0a7a7d601 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Wed, 9 Jul 2025 11:38:36 +0400 Subject: [PATCH 97/98] falcon-h1: fix wrong size param --- src/llama-model.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama-model.cpp b/src/llama-model.cpp index b882e72df1c63..56e9a4cef3512 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -4557,7 +4557,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) { // if output is NULL, init from the input tok embed if (output == NULL) { - output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); + output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {hidden_size, n_vocab}, TENSOR_DUPLICATED); } for (int i = 0; i < n_layer; ++i) { From 94ab3a88e9bbfce1fe9e9ff2bede0a2eff3f09d4 Mon Sep 17 00:00:00 2001 From: younesbelkada Date: Wed, 9 Jul 2025 11:46:11 +0400 Subject: [PATCH 98/98] falcon-h1: fix whitespace issues --- src/llama-model.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 56e9a4cef3512..e424350bdd783 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -2084,6 +2084,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) { const int64_t q_lora_rank = hparams.n_lora_q; const int64_t kv_lora_rank = hparams.n_lora_kv; tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); + // output output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0); output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED); @@ -4550,7 +4551,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) { // embeddings tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {hidden_size, n_vocab}, 0); - + // output output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {hidden_size, n_vocab}, TENSOR_NOT_REQUIRED); output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {hidden_size}, 0);
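
Note on the gating change in PATCH 63 above: the SSM output `y` was previously gated as `y * silu(z)` and is now produced with `ggml_swiglu_split(ctx0, ggml_cont(ctx0, z), y)`. The short sketch below is plain NumPy, not llama.cpp code; the `silu` helper and the tensor shapes are illustrative assumptions. It only shows the elementwise identity the fused call is expected to implement, i.e. that gating `y` by `silu(z)` is the same product whichever operand is written first.

```python
import numpy as np

def silu(x: np.ndarray) -> np.ndarray:
    # SiLU / swish activation: x * sigmoid(x)
    return x / (1.0 + np.exp(-x))

# Illustrative shapes only; the real tensors are the SSM output y and gate z.
rng = np.random.default_rng(0)
y = rng.standard_normal((4, 8)).astype(np.float32)
z = rng.standard_normal((4, 8)).astype(np.float32)

explicit = y * silu(z)   # old formulation in the patch: ggml_mul(y, ggml_silu(z))
fused    = silu(z) * y   # what the fused swiglu-split call is assumed to compute

assert np.allclose(explicit, fused)
```

Under that assumption the swap is a refactor onto the fused ggml op rather than a behavioural change, so it should not alter results beyond floating-point rounding.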