Commit 8a1ea3e

feat: Add support for dense FFN in GraniteMoeHybrid
This was already partially supported by reusing the granite FFN builder, and there may be models that leverage this architecture going forward. The naming is a bit odd, but in the transformers version it reuses the same model class and simply has zero regular experts and a single shared expert (which is the same as a single dense FFN).

Branch: GraniteFour

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
1 parent 257d436 commit 8a1ea3e
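
For context, a minimal sketch of the dense-vs-MoE distinction the commit relies on, assuming a transformers-style hparams dict (values invented for illustration):

# Hypothetical hparams mirroring the transformers config described above;
# the values here are made up.
hparams = {"num_local_experts": 0, "shared_intermediate_size": 1024}

# Zero regular experts means the lone shared expert is effectively a dense
# FFN, so its tensors should map to FFN_GATE/FFN_UP/FFN_DOWN rather than
# the *_SHEXP shared-expert variants.
has_experts = bool(hparams.get("num_local_experts"))
assert not has_experts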

File tree

2 files changed: +20 lines, -3 lines

convert_hf_to_gguf.py

Lines changed: 15 additions & 3 deletions
@@ -6538,13 +6538,25 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
                 (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP_EXP, bid), up),
             ]
 
+        has_experts = bool(self.hparams.get('num_local_experts'))
+
         if name.endswith("shared_mlp.input_linear.weight"):
             ffn_dim = self.hparams["shared_intermediate_size"]
             assert data_torch.shape[-2] == 2 * ffn_dim, "Merged FFN tensor size must be 2 * shared_intermediate_size"
             gate, up = data_torch.split(ffn_dim, dim=-2)
+            if has_experts:
+                return [
+                    (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE_SHEXP, bid), gate),
+                    (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP_SHEXP, bid), up),
+                ]
+            return [
+                (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE, bid), gate),
+                (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP, bid), up),
+            ]
+
+        if not has_experts and name.endswith("shared_mlp.output_linear.weight"):
             return [
-                (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE_SHEXP, bid), gate),
-                (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP_SHEXP, bid), up),
+                (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_DOWN, bid), data_torch)
             ]
 
         return super().modify_tensors(data_torch, name, bid)

@@ -6569,7 +6581,7 @@ def modify_tensors(
     ) -> Iterable[tuple[str, Tensor]]:
         if (
             name.endswith("block_sparse_moe.input_linear.weight")
-            or name.endswith("shared_mlp.input_linear.weight")
+            or "shared_mlp" in name
         ):
             return GraniteMoeModel.modify_tensors(self, data_torch, name, bid)
         return super().modify_tensors(data_torch, name, bid)
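
To make the gate/up unpacking above concrete, here is a hedged toy sketch of the split mechanics (sizes invented; the real ffn_dim comes from hparams["shared_intermediate_size"]):

import torch

hidden_size = 8   # toy value for illustration only
ffn_dim = 16      # stands in for hparams["shared_intermediate_size"]

# shared_mlp.input_linear.weight stacks the gate and up projections along
# dim -2, so the merged tensor has shape (2 * ffn_dim, hidden_size).
merged = torch.randn(2 * ffn_dim, hidden_size)
assert merged.shape[-2] == 2 * ffn_dim

# Same call as in the diff: two chunks of ffn_dim rows each.
gate, up = merged.split(ffn_dim, dim=-2)
print(gate.shape, up.shape)  # torch.Size([16, 8]) torch.Size([16, 8])

The dense branch then emits these halves as FFN_GATE/FFN_UP, while output_linear passes through unchanged as FFN_DOWN.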

gguf-py/gguf/constants.py

Lines changed: 5 additions & 0 deletions
@@ -2190,13 +2190,18 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.ATTN_V,
         MODEL_TENSOR.ATTN_OUT,
         MODEL_TENSOR.FFN_NORM,
+        # MoE
         MODEL_TENSOR.FFN_GATE_INP,
         MODEL_TENSOR.FFN_GATE_EXP,
         MODEL_TENSOR.FFN_DOWN_EXP,
         MODEL_TENSOR.FFN_UP_EXP,
         MODEL_TENSOR.FFN_GATE_SHEXP,
         MODEL_TENSOR.FFN_UP_SHEXP,
         MODEL_TENSOR.FFN_DOWN_SHEXP,
+        # Dense
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
     ],
     MODEL_ARCH.CHAMELEON: [
         MODEL_TENSOR.TOKEN_EMBD,
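
A hedged note on why this constants change matters: gguf-py only builds tensor-name mappings for enums listed per architecture in the MODEL_TENSORS dict this hunk edits (simplified sketch; the real logic lives in gguf.tensor_mapping, details assumed):

from gguf.constants import MODEL_ARCH, MODEL_TENSOR, MODEL_TENSORS

def arch_supports(arch: MODEL_ARCH, tensor: MODEL_TENSOR) -> bool:
    # A tensor can only be mapped and emitted if its enum is whitelisted
    # for the architecture; the hunk above whitelists the dense FFN tensors.
    return tensor in MODEL_TENSORS[arch]

print(arch_supports(MODEL_ARCH.LLAMA, MODEL_TENSOR.FFN_GATE))  # True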
