Commit d1d54d8 (2 parents: 44cda75 + 4d6a179)

Merge remote-tracking branch 'origin/compilade/refactor-kv-cache' into GraniteFour

* origin/compilade/refactor-kv-cache:
  gguf-py : avoid adding duplicate tensor mappings for Jamba
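
The change is a de-duplication inside TensorNameMap: Jamba and Bamba reuse several Hugging Face tensor names that were already listed for other architectures (or for each other), so the redundant entries are dropped and the surviving entry's comment is extended to name every architecture that uses it. As a rough illustration of why the duplicates added nothing, here is a minimal sketch of how such name templates get expanded into one flat lookup table; it is simplified and not the actual gguf-py code (the real map stores the target GGUF tensor name as well and filters by architecture):

# Simplified sketch, not the actual gguf-py implementation: each
# MODEL_TENSOR maps to a tuple of HF tensor-name templates that are
# expanded per block index into a single flat lookup dict, so a
# template listed twice would only rewrite the same key.
from enum import Enum, auto


class MODEL_TENSOR(Enum):  # stand-in for gguf.constants.MODEL_TENSOR
    FFN_UP = auto()


BLOCK_MAPPINGS = {
    MODEL_TENSOR.FFN_UP: (
        "model.layers.{bid}.feed_forward.up_proj",  # llama4 jamba bamba
        "transformer_encoder.{bid}.ffn.w12",        # neobert
    ),
}


def build_mapping(n_blocks: int) -> dict[str, MODEL_TENSOR]:
    mapping: dict[str, MODEL_TENSOR] = {}
    for tensor, templates in BLOCK_MAPPINGS.items():
        for bid in range(n_blocks):
            for tmpl in templates:
                mapping[tmpl.format(bid=bid)] = tensor
    return mapping


print(build_mapping(2)["model.layers.1.feed_forward.up_proj"])  # MODEL_TENSOR.FFN_UP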

File tree: 1 file changed, +6 −15 lines

gguf-py/gguf/tensor_mapping.py

Lines changed: 6 additions & 15 deletions
@@ -279,11 +279,10 @@ class TensorNameMap:
             "transformer.decoder_layer.{bid}.rms_norm_2",     # Grok
             "encoder.layers.{bid}.post_attention_layernorm",  # chatglm
             "transformer.layers.{bid}.ffn_norm",              # openelm
-            "model.layers.{bid}.pre_ff_layernorm",            # jamba
+            "model.layers.{bid}.pre_ff_layernorm",            # jamba bamba
             "model.layers.{bid}.pre_moe_layernorm",           # mini-jamba
             "model.layers.{bid}.post_attention_layernorm",    # llama4
             "transformer_encoder.{bid}.ffn_norm",             # neobert
-            "model.layers.{bid}.pre_ff_layernorm",            # bamba
         ),

         # Post feed-forward norm
@@ -305,9 +304,8 @@ class TensorNameMap:
             "model.layers.{bid}.mlp.gate",                       # qwen2moe olmoe
             "transformer.decoder_layer.{bid}.router",            # Grok
             "transformer.blocks.{bid}.ffn.router.layer",         # dbrx
-            "model.layers.{bid}.feed_forward.router",            # jamba
             "model.layers.{bid}.block_sparse_moe.router.layer",  # granitemoe
-            "model.layers.{bid}.feed_forward.router",            # llama4
+            "model.layers.{bid}.feed_forward.router",            # llama4 jamba
             "encoder.layers.{bid}.mlp.router.layer",             # nomic-bert-moe
             "model.layers.{bid}.mlp.gate.wg",                    # hunyuan
         ),
@@ -349,12 +347,10 @@ class TensorNameMap:
             "encoder.layer.{bid}.mlp.gated_layers",     # jina-bert-v2 (GEGLU)
             "encoder.layer.{bid}.mlp.up_gated_layer",   # jina-v2-code (GEGLU)
             "model.layers.{bid}.residual_mlp.w3",       # arctic
-            "model.layers.{bid}.feed_forward.up_proj",  # jamba
             "encoder.layers.{bid}.mlp.dense_h_to_4h",   # chatglm
             "transformer.h.{bid}.mlp.c_fc_1",           # exaone
-            "model.layers.{bid}.feed_forward.up_proj",  # llama4
+            "model.layers.{bid}.feed_forward.up_proj",  # llama4 jamba bamba
             "transformer_encoder.{bid}.ffn.w12",        # neobert
-            "model.layers.{bid}.feed_forward.up_proj",  # bamba
         ),

         MODEL_TENSOR.FFN_UP_EXP: (
@@ -392,10 +388,8 @@ class TensorNameMap:
             "encoder.layer.{bid}.mlp.gated_layers_w",                    # jina-bert-v2 (split up/gate, no longer used)
             "transformer.h.{bid}.mlp.linear_1",                          # refact
             "model.layers.{bid}.residual_mlp.w1",                        # arctic
-            "model.layers.{bid}.feed_forward.gate_proj",                 # jamba
             "transformer.h.{bid}.mlp.c_fc_0",                            # exaone
-            "language_model.model.layers.{bid}.feed_forward.gate_proj",  # llama4
-            "model.layers.{bid}.feed_forward.gate_proj",                 # bamba
+            "model.layers.{bid}.feed_forward.gate_proj",                 # llama4 jamba bamba
         ),

         MODEL_TENSOR.FFN_GATE_EXP: (
@@ -439,12 +433,10 @@ class TensorNameMap:
             "transformer.layers.{bid}.ffn.proj_2",        # openelm
             "model.layers.{bid}.residual_mlp.w2",         # arctic
             "encoder.layer.{bid}.mlp.down_layer",         # jina-bert-v2
-            "model.layers.{bid}.feed_forward.down_proj",  # jamba
             "encoder.layers.{bid}.mlp.dense_4h_to_h",     # chatglm
             "model.layers.h.{bid}.mlp.c_proj",            # exaone
-            "model.layers.{bid}.feed_forward.down_proj",  # llama4
+            "model.layers.{bid}.feed_forward.down_proj",  # llama4 jamba bamba
             "transformer_encoder.{bid}.ffn.w3",           # neobert
-            "model.layers.{bid}.feed_forward.down_proj",  # bamba
         ),

         MODEL_TENSOR.FFN_DOWN_EXP: (
@@ -614,9 +606,8 @@ class TensorNameMap:
         ),

         MODEL_TENSOR.SSM_NORM: (
-            "model.layers.{bid}.mamba.norm",     # falcon-h1
+            "model.layers.{bid}.mamba.norm",     # falcon-h1 bamba
             "backbone.layers.{bid}.mixer.norm",  # mamba2
-            "model.layers.{bid}.mamba.norm",     # bamba
         ),

         MODEL_TENSOR.SSM_OUT: (