Commit 4e9fef1

refactor: Remove ATTENTION_LAYER_INDICES hparam in favor of n_head_kv

This matches how recurrent vs attention heads are identified for Jamba.

Branch: GraniteFour

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

1 parent 5b44f4e · commit 4e9fef1

File tree

convert_hf_to_gguf.py
gguf-py/gguf/constants.py
gguf-py/gguf/gguf_writer.py
src/llama-arch.cpp
src/llama-arch.h
src/llama-model.cpp

6 files changed: +8 -30 lines

convert_hf_to_gguf.py

Lines changed: 5 additions & 2 deletions
@@ -6570,11 +6570,14 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_ssm_time_step_rank(self.find_hparam(["n_heads"]))

         ## Attention params ##
-        self.gguf_writer.add_attn_layer_indices(self._attn_layers)
+        head_count_kv = self.find_hparam(["num_key_value_heads", "n_head_kv"])
+        head_count_kv_vec = [
+            head_count_kv if i in self._attn_layers else 0 for i in range(self.block_count)
+        ]
         if rope_dim := self.hparams.get("attn_rotary_emb"):
             self.gguf_writer.add_rope_dimension_count(rope_dim)
         self.gguf_writer.add_head_count(self.hparams["num_attention_heads"])
-        self.gguf_writer.add_head_count_kv(self.find_hparam(["num_key_value_heads", "n_head_kv"]))
+        self.gguf_writer.add_head_count_kv(head_count_kv_vec)

         ## Feed Forward Params ##
         self.gguf_writer.add_layer_norm_rms_eps(
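For illustration, the converter change reduces to a single list comprehension. Below is a minimal standalone sketch of that logic, not the converter class itself; `attn_layers`, `head_count_kv`, and `block_count` stand in for the converter's `self._attn_layers`, the looked-up KV-head count, and `self.block_count`, and the example values are hypothetical:

    def head_count_kv_per_layer(attn_layers: set[int], head_count_kv: int, block_count: int) -> list[int]:
        # Attention layers keep their KV-head count; recurrent (SSM) layers get 0,
        # which is the marker the loader later uses to tell the two apart.
        return [head_count_kv if i in attn_layers else 0 for i in range(block_count)]

    # Hypothetical 6-block hybrid with attention at blocks 1 and 4:
    assert head_count_kv_per_layer({1, 4}, 8, 6) == [0, 8, 0, 0, 8, 0]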

gguf-py/gguf/constants.py

Lines changed: 0 additions & 3 deletions
@@ -173,9 +173,6 @@ class SSM:
         GROUP_COUNT = "{arch}.ssm.group_count"
         DT_B_C_RMS = "{arch}.ssm.dt_b_c_rms"

-    class HybridAttention:
-        ATTN_LAYER_INDICES = "{arch}.attention.layer_indices"
-
     class WKV:
         HEAD_SIZE = "{arch}.wkv.head_size"


gguf-py/gguf/gguf_writer.py

Lines changed: 0 additions & 3 deletions
@@ -874,9 +874,6 @@ def add_ssm_group_count(self, value: int) -> None:
     def add_ssm_dt_b_c_rms(self, value: bool) -> None:
        self.add_bool(Keys.SSM.DT_B_C_RMS.format(arch=self.arch), value)

-    def add_attn_layer_indices(self, values: list[int]) -> None:
-        self.add_array(Keys.HybridAttention.ATTN_LAYER_INDICES.format(arch=self.arch), values)
-
     def add_tokenizer_model(self, model: str) -> None:
         self.add_string(Keys.Tokenizer.MODEL, model)

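With `add_attn_layer_indices` gone, the per-layer information travels through the existing `add_head_count_kv` setter instead, as the converter hunk above shows. A hedged usage sketch, assuming a gguf-py version whose `add_head_count_kv` accepts a per-layer sequence (which this commit relies on); the file name, arch string, and head counts are placeholders:

    import gguf

    # Write only the metadata relevant here; no tensors are added in this sketch.
    writer = gguf.GGUFWriter("hybrid-sketch.gguf", arch="llama")  # placeholder arch
    writer.add_architecture()
    writer.add_block_count(6)
    writer.add_head_count_kv([0, 8, 0, 0, 8, 0])  # 0 marks the recurrent (SSM) layers

    writer.write_header_to_file()
    writer.write_kv_data_to_file()
    writer.write_tensors_to_file()
    writer.close()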

src/llama-arch.cpp

Lines changed: 0 additions & 1 deletion
@@ -155,7 +155,6 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
     { LLM_KV_ATTENTION_KEY_LENGTH_MLA, "%s.attention.key_length_mla" },
     { LLM_KV_ATTENTION_VALUE_LENGTH_MLA, "%s.attention.value_length_mla" },
-    { LLM_KV_ATTENTION_LAYER_INDICES, "%s.attention.layer_indices" },

     { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
     { LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" },

src/llama-arch.h

Lines changed: 0 additions & 1 deletion
@@ -159,7 +159,6 @@ enum llm_kv {
     LLM_KV_ATTENTION_SCALE,
     LLM_KV_ATTENTION_KEY_LENGTH_MLA,
     LLM_KV_ATTENTION_VALUE_LENGTH_MLA,
-    LLM_KV_ATTENTION_LAYER_INDICES,

     LLM_KV_ROPE_DIMENSION_COUNT,
     LLM_KV_ROPE_DIMENSION_SECTIONS,

src/llama-model.cpp

Lines changed: 3 additions & 20 deletions
@@ -1538,26 +1538,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                 ml.get_key(LLM_KV_ROPE_SCALING_FINETUNED, rope_finetuned, false);
                 hparams.rope_finetuned = rope_finetuned;

-                // Zero-out n_head_arr and n_head_kv_arr since SSM layers don't
-                // have attention heads. We'll set them correctly below once we
-                // know which layers are attention layers
-                // NOTE: It's important that this happens after n_embd_head_[kv]
-                // are set above!
-                const auto n_head_attn = hparams.n_head();
-                const auto n_head_kv_attn = hparams.n_head_kv();
-                std::fill(hparams.n_head_arr.begin(), hparams.n_head_arr.end(), 0);
-                std::fill(hparams.n_head_kv_arr.begin(), hparams.n_head_kv_arr.end(), 0);
-
-                // Attention params
-                std::fill(hparams.recurrent_layer_arr.begin(), hparams.recurrent_layer_arr.end(), true);
-                std::vector<uint32_t> attn_layer_indices;
-                ml.get_arr(LLM_KV_ATTENTION_LAYER_INDICES, attn_layer_indices);
-                for (const auto attn_idx : attn_layer_indices) {
-                    GGML_ASSERT(attn_idx < hparams.n_layer);
-                    hparams.recurrent_layer_arr[attn_idx] = false;
-                    // Correctly set n_head and n_head_kv for attention layers
-                    hparams.n_head_arr[attn_idx] = n_head_attn;
-                    hparams.n_head_kv_arr[attn_idx] = n_head_kv_attn;
+                // A layer is recurrent IFF the n_head_kv value is set to 0
+                for (uint32_t i = 0; i < hparams.n_layer; ++i) {
+                    hparams.recurrent_layer_arr[i] = hparams.n_head_kv(i) == 0;
                 }

                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
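On the loader side the rule is now a single pass over the per-layer KV-head counts. A plain-Python mirror of the C++ loop above, for illustration only (the actual logic lives in `llama_model::load_hparams`):

    def recurrent_layer_mask(n_head_kv_arr: list[int]) -> list[bool]:
        # A layer is recurrent iff its KV-head count is 0, the same convention
        # used to distinguish recurrent from attention layers for Jamba.
        return [n_head_kv == 0 for n_head_kv in n_head_kv_arr]

    # The example vector from the converter sketch above:
    assert recurrent_layer_mask([0, 8, 0, 0, 8, 0]) == [True, False, True, True, False, True]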
