Commit 4e9fef1

refactor: Remove ATTENTION_LAYER_INDICES hparam in favor of n_head_kv

This matches how recurrent vs attention heads are identified for Jamba.

Branch: GraniteFour

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

1 parent 5b44f4e · commit 4e9fef1

File tree

convert_hf_to_gguf.py
gguf-py/gguf/constants.py
gguf-py/gguf/gguf_writer.py
src/llama-arch.cpp
src/llama-arch.h
src/llama-model.cpp

6 files changed: +8 -30 lines

convert_hf_to_gguf.py

Lines changed: 5 additions & 2 deletions
@@ -6570,11 +6570,14 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_ssm_time_step_rank(self.find_hparam(["n_heads"]))

         ## Attention params ##
-        self.gguf_writer.add_attn_layer_indices(self._attn_layers)
+        head_count_kv = self.find_hparam(["num_key_value_heads", "n_head_kv"])
+        head_count_kv_vec = [
+            head_count_kv if i in self._attn_layers else 0 for i in range(self.block_count)
+        ]
         if rope_dim := self.hparams.get("attn_rotary_emb"):
             self.gguf_writer.add_rope_dimension_count(rope_dim)
         self.gguf_writer.add_head_count(self.hparams["num_attention_heads"])
-        self.gguf_writer.add_head_count_kv(self.find_hparam(["num_key_value_heads", "n_head_kv"]))
+        self.gguf_writer.add_head_count_kv(head_count_kv_vec)

         ## Feed Forward Params ##
         self.gguf_writer.add_layer_norm_rms_eps(
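For illustration, the converter change reduces to a single list comprehension. Below is a minimal standalone sketch of that logic, not the converter class itself; `attn_layers`, `head_count_kv`, and `block_count` stand in for the converter's `self._attn_layers`, the looked-up KV-head count, and `self.block_count`, and the example values are hypothetical:

    def head_count_kv_per_layer(attn_layers: set[int], head_count_kv: int, block_count: int) -> list[int]:
        # Attention layers keep their KV-head count; recurrent (SSM) layers get 0,
        # which is the marker the loader later uses to tell the two apart.
        return [head_count_kv if i in attn_layers else 0 for i in range(block_count)]

    # Hypothetical 6-block hybrid with attention at blocks 1 and 4:
    assert head_count_kv_per_layer({1, 4}, 8, 6) == [0, 8, 0, 0, 8, 0]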

gguf-py/gguf/constants.py

Lines changed: 0 additions & 3 deletions
@@ -173,9 +173,6 @@ class SSM:
         GROUP_COUNT = "{arch}.ssm.group_count"
         DT_B_C_RMS = "{arch}.ssm.dt_b_c_rms"

-    class HybridAttention:
-        ATTN_LAYER_INDICES = "{arch}.attention.layer_indices"
-
     class WKV:
         HEAD_SIZE = "{arch}.wkv.head_size"


gguf-py/gguf/gguf_writer.py

Lines changed: 0 additions & 3 deletions
@@ -874,9 +874,6 @@ def add_ssm_group_count(self, value: int) -> None:
     def add_ssm_dt_b_c_rms(self, value: bool) -> None:
        self.add_bool(Keys.SSM.DT_B_C_RMS.format(arch=self.arch), value)

-    def add_attn_layer_indices(self, values: list[int]) -> None:
-        self.add_array(Keys.HybridAttention.ATTN_LAYER_INDICES.format(arch=self.arch), values)
-
     def add_tokenizer_model(self, model: str) -> None:
         self.add_string(Keys.Tokenizer.MODEL, model)

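With `add_attn_layer_indices` gone, the per-layer information travels through the existing `add_head_count_kv` setter instead, as the converter hunk above shows. A hedged usage sketch, assuming a gguf-py version whose `add_head_count_kv` accepts a per-layer sequence (which this commit relies on); the file name, arch string, and head counts are placeholders:

    import gguf

    # Write only the metadata relevant here; no tensors are added in this sketch.
    writer = gguf.GGUFWriter("hybrid-sketch.gguf", arch="llama")  # placeholder arch
    writer.add_architecture()
    writer.add_block_count(6)
    writer.add_head_count_kv([0, 8, 0, 0, 8, 0])  # 0 marks the recurrent (SSM) layers

    writer.write_header_to_file()
    writer.write_kv_data_to_file()
    writer.write_tensors_to_file()
    writer.close()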

src/llama-arch.cpp

Lines changed: 0 additions & 1 deletion
@@ -155,7 +155,6 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
     { LLM_KV_ATTENTION_KEY_LENGTH_MLA, "%s.attention.key_length_mla" },
     { LLM_KV_ATTENTION_VALUE_LENGTH_MLA, "%s.attention.value_length_mla" },
-    { LLM_KV_ATTENTION_LAYER_INDICES, "%s.attention.layer_indices" },

     { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
     { LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" },

src/llama-arch.h

Lines changed: 0 additions & 1 deletion
@@ -159,7 +159,6 @@ enum llm_kv {
     LLM_KV_ATTENTION_SCALE,
     LLM_KV_ATTENTION_KEY_LENGTH_MLA,
     LLM_KV_ATTENTION_VALUE_LENGTH_MLA,
-    LLM_KV_ATTENTION_LAYER_INDICES,

     LLM_KV_ROPE_DIMENSION_COUNT,
     LLM_KV_ROPE_DIMENSION_SECTIONS,

src/llama-model.cpp

Lines changed: 3 additions & 20 deletions
@@ -1538,26 +1538,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                 ml.get_key(LLM_KV_ROPE_SCALING_FINETUNED, rope_finetuned, false);
                 hparams.rope_finetuned = rope_finetuned;

-                // Zero-out n_head_arr and n_head_kv_arr since SSM layers don't
-                // have attention heads. We'll set them correctly below once we
-                // know which layers are attention layers
-                // NOTE: It's important that this happens after n_embd_head_[kv]
-                // are set above!
-                const auto n_head_attn = hparams.n_head();
-                const auto n_head_kv_attn = hparams.n_head_kv();
-                std::fill(hparams.n_head_arr.begin(), hparams.n_head_arr.end(), 0);
-                std::fill(hparams.n_head_kv_arr.begin(), hparams.n_head_kv_arr.end(), 0);
-
-                // Attention params
-                std::fill(hparams.recurrent_layer_arr.begin(), hparams.recurrent_layer_arr.end(), true);
-                std::vector<uint32_t> attn_layer_indices;
-                ml.get_arr(LLM_KV_ATTENTION_LAYER_INDICES, attn_layer_indices);
-                for (const auto attn_idx : attn_layer_indices) {
-                    GGML_ASSERT(attn_idx < hparams.n_layer);
-                    hparams.recurrent_layer_arr[attn_idx] = false;
-                    // Correctly set n_head and n_head_kv for attention layers
-                    hparams.n_head_arr[attn_idx] = n_head_attn;
-                    hparams.n_head_kv_arr[attn_idx] = n_head_kv_attn;
+                // A layer is recurrent IFF the n_head_kv value is set to 0
+                for (uint32_t i = 0; i < hparams.n_layer; ++i) {
+                    hparams.recurrent_layer_arr[i] = hparams.n_head_kv(i) == 0;
                 }

                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
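On the loader side the rule is now a single pass over the per-layer KV-head counts. A plain-Python mirror of the C++ loop above, for illustration only (the actual logic lives in `llama_model::load_hparams`):

    def recurrent_layer_mask(n_head_kv_arr: list[int]) -> list[bool]:
        # A layer is recurrent iff its KV-head count is 0, the same convention
        # used to distinguish recurrent from attention layers for Jamba.
        return [n_head_kv == 0 for n_head_kv in n_head_kv_arr]

    # The example vector from the converter sketch above:
    assert recurrent_layer_mask([0, 8, 0, 0, 8, 0]) == [True, False, True, True, False, True]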
