
Commit f30e462

Merge branch 'master' into crokeso
2 parents: 6db6601 + cb887f1

17 files changed: +916, -310 lines

convert_hf_to_gguf.py

Lines changed: 88 additions & 1 deletion
@@ -3191,7 +3191,8 @@ def set_gguf_parameters(self):
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         num_heads = self.hparams["num_attention_heads"]
         num_kv_heads = self.hparams["num_key_value_heads"]
-        head_dim = self.hparams["head_dim"]
+        if (head_dim := self.hparams.get("head_dim")) is None:
+            head_dim = self.hparams["hidden_size"] // num_heads
 
         if "ernie." in name:
             name = name.replace("ernie.", "model.")
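
Note: the fallback above follows the common HF convention that head_dim defaults to hidden_size // num_attention_heads when the config omits it. A minimal standalone sketch of the same lookup, using made-up toy values (not Ernie 4.5's real config):

# Hedged sketch of the head_dim fallback; hparams values here are hypothetical.
hparams = {"num_attention_heads": 16, "hidden_size": 2048}  # no "head_dim" key

if (head_dim := hparams.get("head_dim")) is None:
    head_dim = hparams["hidden_size"] // hparams["num_attention_heads"]

assert head_dim == 128  # 2048 // 16
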
@@ -3224,6 +3225,92 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         return [(self.map_tensor_name(name), data_torch)]
 
 
+@ModelBase.register("Ernie4_5_MoeForCausalLM")
+class Ernie4_5MoeModel(Ernie4_5Model):
+    model_arch = gguf.MODEL_ARCH.ERNIE4_5_MOE
+    _experts: list[dict[str, Tensor]] | None = None
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self._experts = [{} for _ in range(self.block_count)]
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_expert_count(self.hparams["moe_num_experts"])
+        self.gguf_writer.add_expert_used_count(self.hparams["moe_k"])
+        self.gguf_writer.add_interleave_moe_layer_step(self.hparams["moe_layer_interval"])
+        self.gguf_writer.add_leading_dense_block_count(self.hparams["moe_layer_start_index"])
+        self.gguf_writer.add_rope_freq_base(self.hparams["rope_theta"])
+        if (moe_intermediate_size := self.hparams.get("moe_intermediate_size")) is not None:
+            self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size)
+        if (shared_expert_intermediate_size := self.hparams.get('intermediate_size')) is not None and (num_key_value_heads := self.hparams.get('num_key_value_heads')) is not None:
+            self.gguf_writer.add_expert_shared_feed_forward_length(shared_expert_intermediate_size // num_key_value_heads)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # Modify correction bias name as in DeepseekV2
+        if name.endswith("e_score_correction_bias"):
+            name = name.replace("e_score_correction_bias", "e_score_correction.bias")
+
+        # skip Multi-Token Prediction (MTP) layers (again, same as DeepseekV2)
+        match = re.match(r"model.mtp_block.(\d+)", name)
+        if match:
+            return []
+
+        # skip all other MTP tensors for now
+        match = re.match(r"model.mtp_emb_norm.(\d+)", name)
+        if match:
+            return []
+
+        match = re.match(r"model.mtp_hidden_norm.(\d+)", name)
+        if match:
+            return []
+
+        match = re.match(r"model.mtp_linear_proj.(\d+)", name)
+        if match:
+            return []
+
+        # process the experts separately
+        if name.find("mlp.experts") != -1:
+            n_experts = self.hparams["moe_num_experts"]
+            assert bid is not None
+
+            if self._experts is None:
+                self._experts = [{} for _ in range(self.block_count)]
+
+            self._experts[bid][name] = data_torch
+
+            if len(self._experts[bid]) >= n_experts * 3:
+                tensors: list[tuple[str, Tensor]] = []
+
+                # merge the experts into a single 3d tensor
+                for w_name in ["gate_proj", "up_proj", "down_proj"]:
+                    datas: list[Tensor] = []
+
+                    for xid in range(n_experts):
+                        ename_to_retrieve = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
+                        datas.append(self._experts[bid][ename_to_retrieve])
+                        del self._experts[bid][ename_to_retrieve]
+
+                    data_torch = torch.stack(datas, dim=0)
+                    merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
+                    new_name = self.map_tensor_name(merged_name)
+                    tensors.append((new_name, data_torch))
+
+                return tensors
+            else:
+                return []
+        return [(self.map_tensor_name(name), data_torch)]
+
+    def prepare_tensors(self):
+        super().prepare_tensors()
+
+        if self._experts is not None:
+            # flatten `list[dict[str, Tensor]]` into `list[str]`
+            experts = [k for d in self._experts for k in d.keys()]
+            if len(experts) > 0:
+                raise ValueError(f"Unprocessed experts: {experts}")
+
+
 @ModelBase.register(
     "Qwen2VLModel",
     "Qwen2VLForConditionalGeneration",

gguf-py/gguf/constants.py

Lines changed: 24 additions & 0 deletions
@@ -366,6 +366,7 @@ class MODEL_ARCH(IntEnum):
     DOTS1 = auto()
     ARCEE = auto()
     ERNIE4_5 = auto()
+    ERNIE4_5_MOE = auto()
     HUNYUAN_MOE = auto()
     SMOLLM3 = auto()
     LFM2 = auto()
@@ -683,6 +684,7 @@ class MODEL_TENSOR(IntEnum):
     MODEL_ARCH.DOTS1: "dots1",
     MODEL_ARCH.ARCEE: "arcee",
     MODEL_ARCH.ERNIE4_5: "ernie4_5",
+    MODEL_ARCH.ERNIE4_5_MOE: "ernie4_5-moe",
     MODEL_ARCH.FALCON_H1: "falcon-h1",
     MODEL_ARCH.HUNYUAN_MOE: "hunyuan-moe",
     MODEL_ARCH.SMOLLM3: "smollm3",
@@ -2025,6 +2027,28 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.FFN_UP_SHEXP,
         MODEL_TENSOR.FFN_EXP_PROBS_B,
     ],
+    MODEL_ARCH.ERNIE4_5_MOE: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+        MODEL_TENSOR.FFN_GATE_INP,
+        MODEL_TENSOR.FFN_GATE_EXP,
+        MODEL_TENSOR.FFN_DOWN_EXP,
+        MODEL_TENSOR.FFN_UP_EXP,
+        MODEL_TENSOR.FFN_GATE_SHEXP,
+        MODEL_TENSOR.FFN_DOWN_SHEXP,
+        MODEL_TENSOR.FFN_UP_SHEXP,
+        MODEL_TENSOR.FFN_EXP_PROBS_B,
+    ],
     MODEL_ARCH.PLM: [
         MODEL_TENSOR.TOKEN_EMBD,
         MODEL_TENSOR.OUTPUT,
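
With these constants registered, downstream tooling can look the new architecture up by enum. A small hedged usage sketch, assuming a gguf-py checkout that includes this change and its existing MODEL_ARCH_NAMES / MODEL_TENSORS lookup tables:

# Hedged sketch: inspecting the new arch via gguf-py's lookup tables (patched gguf-py assumed).
import gguf

arch = gguf.MODEL_ARCH.ERNIE4_5_MOE
print(gguf.MODEL_ARCH_NAMES[arch])    # expected: "ernie4_5-moe"
print(len(gguf.MODEL_TENSORS[arch]))  # expected: 20 tensor kinds, per the list above
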

gguf-py/gguf/tensor_mapping.py

Lines changed: 23 additions & 22 deletions
@@ -324,7 +324,8 @@ class TensorNameMap:
         ),
 
         MODEL_TENSOR.FFN_EXP_PROBS_B: (
-            "model.layers.{bid}.mlp.gate.e_score_correction", # deepseek-v3 dots1
+            "model.layers.{bid}.mlp.gate.e_score_correction",        # deepseek-v3 dots1
+            "model.layers.{bid}.mlp.moe_statics.e_score_correction", # ernie4.5-moe
         ),
 
         # Feed-forward up
@@ -364,13 +365,13 @@ class TensorNameMap:
         ),
 
         MODEL_TENSOR.FFN_UP_EXP: (
-            "layers.{bid}.feed_forward.experts.w3",          # mixtral (merged)
-            "transformer.decoder_layer.{bid}.moe.linear_v",  # Grok (merged)
-            "transformer.blocks.{bid}.ffn.experts.mlp.v1",   # dbrx
-            "model.layers.{bid}.mlp.experts.up_proj",        # qwen2moe olmoe (merged)
-            "model.layers.{bid}.block_sparse_moe.experts.w3", # phimoe (merged)
-            "model.layers.{bid}.feed_forward.experts.up_proj", # llama4
-            "encoder.layers.{bid}.mlp.experts.mlp.w1",       # nomic-bert-moe
+            "layers.{bid}.feed_forward.experts.w3",            # mixtral (merged)
+            "transformer.decoder_layer.{bid}.moe.linear_v",    # Grok (merged)
+            "transformer.blocks.{bid}.ffn.experts.mlp.v1",     # dbrx
+            "model.layers.{bid}.mlp.experts.up_proj",          # qwen2moe olmoe (merged) ernie4.5-moe
+            "model.layers.{bid}.block_sparse_moe.experts.w3",  # phimoe (merged)
+            "model.layers.{bid}.feed_forward.experts.up_proj", # llama4
+            "encoder.layers.{bid}.mlp.experts.mlp.w1",         # nomic-bert-moe
         ),
 
         MODEL_TENSOR.FFN_UP_SHEXP: (
@@ -403,12 +404,12 @@ class TensorNameMap:
         ),
 
         MODEL_TENSOR.FFN_GATE_EXP: (
-            "layers.{bid}.feed_forward.experts.w1",         # mixtral (merged)
-            "transformer.decoder_layer.{bid}.moe.linear",   # Grok (merged)
-            "transformer.blocks.{bid}.ffn.experts.mlp.w1",  # dbrx
-            "model.layers.{bid}.mlp.experts.gate_proj",     # qwen2moe olmoe (merged)
-            "model.layers.{bid}.block_sparse_moe.experts.w1", # phimoe (merged)
-            "model.layers.{bid}.feed_forward.experts.gate_proj", # llama4
+            "layers.{bid}.feed_forward.experts.w1",              # mixtral (merged)
+            "transformer.decoder_layer.{bid}.moe.linear",        # Grok (merged)
+            "transformer.blocks.{bid}.ffn.experts.mlp.w1",       # dbrx
+            "model.layers.{bid}.mlp.experts.gate_proj",          # qwen2moe olmoe (merged) ernie4.5-moe
+            "model.layers.{bid}.block_sparse_moe.experts.w1",    # phimoe (merged)
+            "model.layers.{bid}.feed_forward.experts.gate_proj", # llama4
         ),
 
         MODEL_TENSOR.FFN_GATE_SHEXP: (
@@ -450,14 +451,14 @@ class TensorNameMap:
         ),
 
         MODEL_TENSOR.FFN_DOWN_EXP: (
-            "layers.{bid}.feed_forward.experts.w2",          # mixtral (merged)
-            "transformer.decoder_layer.{bid}.moe.linear_1",  # Grok (merged)
-            "transformer.blocks.{bid}.ffn.experts.mlp.w2",   # dbrx
-            "model.layers.{bid}.mlp.experts.down_proj",      # qwen2moe olmoe (merged)
-            "model.layers.{bid}.block_sparse_moe.output_linear", # granitemoe
-            "model.layers.{bid}.block_sparse_moe.experts.w2", # phimoe (merged)
-            "model.layers.{bid}.feed_forward.experts.down_proj", # llama4
-            "encoder.layers.{bid}.mlp.experts.mlp.w2",       # nomic-bert-moe
+            "layers.{bid}.feed_forward.experts.w2",              # mixtral (merged)
+            "transformer.decoder_layer.{bid}.moe.linear_1",      # Grok (merged)
+            "transformer.blocks.{bid}.ffn.experts.mlp.w2",       # dbrx
+            "model.layers.{bid}.mlp.experts.down_proj",          # qwen2moe olmoe (merged) ernie4.5-moe
+            "model.layers.{bid}.block_sparse_moe.output_linear", # granitemoe
+            "model.layers.{bid}.block_sparse_moe.experts.w2",    # phimoe (merged)
+            "model.layers.{bid}.feed_forward.experts.down_proj", # llama4
+            "encoder.layers.{bid}.mlp.experts.mlp.w2",           # nomic-bert-moe
         ),
 
         MODEL_TENSOR.FFN_DOWN_SHEXP: (
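
These mapping entries are what let the converter resolve the merged HF tensor names into GGUF names. A small hedged sketch, assuming gguf-py with this patch applied and its existing get_tensor_name_map helper; the expected output is an assumption based on the tables above:

# Hedged sketch: mapping a merged Ernie 4.5 MoE expert tensor name to its GGUF name.
import gguf

tmap = gguf.get_tensor_name_map(gguf.MODEL_ARCH.ERNIE4_5_MOE, 2)  # 2 blocks, toy value
name = tmap.get_name("model.layers.0.mlp.experts.up_proj.weight", try_suffixes=(".weight", ".bias"))
print(name)  # expected roughly: "blk.0.ffn_up_exps.weight"
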

include/llama.h

Lines changed: 1 addition & 0 deletions
@@ -1461,6 +1461,7 @@ extern "C" {
 
         int32_t n_p_eval;
         int32_t n_eval;
+        int32_t n_reused; // number of times a ggml compute graph had been reused
     };
 
     struct llama_perf_sampler_data {

src/llama-arch.cpp

Lines changed: 26 additions & 0 deletions
@@ -82,6 +82,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_DOTS1,        "dots1"        },
     { LLM_ARCH_ARCEE,        "arcee"        },
     { LLM_ARCH_ERNIE4_5,     "ernie4_5"     },
+    { LLM_ARCH_ERNIE4_5_MOE, "ernie4_5-moe" },
     { LLM_ARCH_HUNYUAN_MOE,  "hunyuan-moe"  },
     { LLM_ARCH_SMOLLM3,      "smollm3"      },
     { LLM_ARCH_LFM2,         "lfm2"         },
@@ -1825,6 +1826,31 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_NAMES = {
             { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_ERNIE4_5_MOE,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
+            { LLM_TENSOR_OUTPUT,          "output" },
+            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q,          "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K,          "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V,          "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM,        "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE,        "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
+            { LLM_TENSOR_FFN_GATE_INP,    "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_GATE_SHEXP,  "blk.%d.ffn_gate_shexp" },
+            { LLM_TENSOR_FFN_DOWN_SHEXP,  "blk.%d.ffn_down_shexp" },
+            { LLM_TENSOR_FFN_UP_SHEXP,    "blk.%d.ffn_up_shexp" },
+            { LLM_TENSOR_FFN_GATE_EXPS,   "blk.%d.ffn_gate_exps" },
+            { LLM_TENSOR_FFN_DOWN_EXPS,   "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS,     "blk.%d.ffn_up_exps" },
+            { LLM_TENSOR_FFN_EXP_PROBS_B, "blk.%d.exp_probs_b" },
+        },
+    },
     {
         LLM_ARCH_HUNYUAN_MOE,
         {

src/llama-arch.h

Lines changed: 1 addition & 0 deletions
@@ -86,6 +86,7 @@ enum llm_arch {
     LLM_ARCH_DOTS1,
     LLM_ARCH_ARCEE,
     LLM_ARCH_ERNIE4_5,
+    LLM_ARCH_ERNIE4_5_MOE,
     LLM_ARCH_HUNYUAN_MOE,
     LLM_ARCH_SMOLLM3,
     LLM_ARCH_LFM2,

0 commit comments
