From f5d8a22766a3e174a22aedf14f18113d5a1f12e7 Mon Sep 17 00:00:00 2001 From: Xuan-Son Nguyen Date: Fri, 27 Jun 2025 18:02:27 +0200 Subject: [PATCH 01/22] model : add hunyuan moe --- convert_hf_to_gguf.py | 120 ++++++++++++++++++++++++++++++++- convert_hf_to_gguf_update.py | 1 + gguf-py/gguf/constants.py | 23 +++++++ gguf-py/gguf/tensor_mapping.py | 6 ++ 4 files changed, 148 insertions(+), 2 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index aed595e259ed5..82f0000377cdd 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -14,7 +14,7 @@ from enum import IntEnum from pathlib import Path from hashlib import sha256 -from typing import TYPE_CHECKING, Any, Callable, ContextManager, Iterable, Iterator, Literal, Sequence, TypeVar, cast +from typing import TYPE_CHECKING, Any, Callable, ContextManager, Iterable, Iterator, Literal, Sequence, TypeVar, Dict, cast from itertools import chain from transformers import AutoConfig @@ -607,7 +607,6 @@ def get_vocab_base(self) -> tuple[list[str], list[int], str]: from transformers import AutoTokenizer tokenizer = AutoTokenizer.from_pretrained(self.dir_model) - vocab_size = self.hparams.get("vocab_size", len(tokenizer.vocab)) assert max(tokenizer.vocab.values()) < vocab_size tokpre = self.get_vocab_base_pre(tokenizer) @@ -815,6 +814,9 @@ def get_vocab_base_pre(self, tokenizer) -> str: if chkhsh == "1431a23e583c97432bc230bff598d103ddb5a1f89960c8f1d1051aaa944d0b35": # ref: https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0 res = "minerva-7b" + if chkhsh == "7e57df22b1fe23a7b1e1c7f3dc4e3f96d43a4eb0836d0c6bdc3436d7b2f1c664": + # ref: https://huggingface.co/tencent/Hunyuan-A13B-Instruct + res = "hunyuan" if res is None: logger.warning("\n") @@ -6390,6 +6392,120 @@ def set_gguf_parameters(self): super().set_gguf_parameters() self.gguf_writer.add_audio_stack_factor(self.global_config["stack_factor"]) + +@ModelBase.register("HunYuanMoEV1ForCausalLM") +class HunYuanMoEModel(LlamaModel): + model_arch = gguf.MODEL_ARCH.HUNYUAN_MOE + undo_permute = False + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def set_vocab(self): + self._set_vocab_gpt2() + + def _load_tiktoken_bpe(tiktoken_bpe_file: str) -> Dict[bytes, int]: + import base64 + dic = {} + rank = 0 + for line in open(tiktoken_bpe_file, "rb"): + if line: + token, _ = line.split() + if base64.b64decode(token) in dic: + continue + dic[base64.b64decode(token)] = int(rank) + rank += 1 + global SPECIAL_START_ID + SPECIAL_START_ID=rank + return dic + + def get_vocab_base(self) -> tuple[list[str], list[int], str]: + tokens: list[str] = [] + toktypes: list[int] = [] + + from transformers import AutoTokenizer + tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True) + print(tokenizer) + print(tokenizer.tokenizer) + print(type(tokenizer.decoder)) + # exit(0) + + reverse_vocab = tokenizer.decoder + assert max(reverse_vocab.keys()) < tokenizer.vocab_size + + tokpre = self.get_vocab_base_pre(tokenizer) + added_vocab = tokenizer.get_added_vocab() + + added_tokens_decoder = tokenizer.added_tokens_decoder + + for i in range(tokenizer.vocab_size): + if i not in reverse_vocab: + tokens.append(f"[PAD{i}]") + toktypes.append(gguf.TokenType.UNUSED) + else: + token: str = reverse_vocab[i] + if token in added_vocab: + # The tokenizer in llama.cpp assumes the CONTROL and USER_DEFINED tokens are pre-normalized. + # To avoid unexpected issues - we make sure to normalize non-normalized tokens + if not added_tokens_decoder[i].normalized: + previous_token = token + token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False)) + if previous_token != token: + logger.info(f"{repr(previous_token)} is encoded and decoded back to {repr(token)} using AutoTokenizer") + + if added_tokens_decoder[i].special or self.does_token_look_special(token): + toktypes.append(gguf.TokenType.CONTROL) + else: + # NOTE: this was added for Gemma. + # Encoding and decoding the tokens above isn't sufficient for this case. + token = token.replace(b"\xe2\x96\x81".decode("utf-8"), " ") # pre-normalize user-defined spaces + toktypes.append(gguf.TokenType.USER_DEFINED) + else: + toktypes.append(gguf.TokenType.NORMAL) + tokens.append(token) + + return tokens, toktypes, tokpre + + def set_gguf_parameters(self): + super().set_gguf_parameters() + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # process the experts separately + if name.find("mlp.experts") != -1: + n_experts = self.hparams["num_experts"] + assert bid is not None + + tensors: list[tuple[str, Tensor]] = [] + + if self._experts is None: + self._experts = [{} for _ in range(self.block_count)] + + self._experts[bid][name] = data_torch + + if len(self._experts[bid]) >= n_experts * 3: + # merge the experts into a single 3d tensor + for w_name in ["down_proj", "gate_proj", "up_proj"]: + datas: list[Tensor] = [] + + for xid in range(n_experts): + ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight" + datas.append(self._experts[bid][ename]) + del self._experts[bid][ename] + + data_torch = torch.stack(datas, dim=0) + + merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight" + + new_name = self.map_tensor_name(merged_name) + + tensors.append((new_name, data_torch)) + + return tensors + else: + return [] + + return [(self.map_tensor_name(name), data_torch)] + ###### CONVERSION LOGIC ###### diff --git a/convert_hf_to_gguf_update.py b/convert_hf_to_gguf_update.py index 2f733f0973686..96a2b692a86c1 100755 --- a/convert_hf_to_gguf_update.py +++ b/convert_hf_to_gguf_update.py @@ -137,6 +137,7 @@ class TOKENIZER_TYPE(IntEnum): {"name": "chatglm-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/THUDM/glm-4-9b-chat", "chkhsh": "81d72c7348a9f0ebe86f23298d37debe0a5e71149e29bd283904c02262b27516"}, {"name": "glm4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/THUDM/glm-4-9b-hf", "chkhsh": "a1336059768a55c99a734006ffb02203cd450fed003e9a71886c88acf24fdbc2"}, {"name": "minerva-7b", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0", "chkhsh": "1431a23e583c97432bc230bff598d103ddb5a1f89960c8f1d1051aaa944d0b35"}, + {"name": "hunyuan", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tencent/Hunyuan-A13B-Instruct", "chkhsh": "7e57df22b1fe23a7b1e1c7f3dc4e3f96d43a4eb0836d0c6bdc3436d7b2f1c664"}, ] diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index fb75143b0b545..e0a4f688ac7a8 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -354,6 +354,7 @@ class MODEL_ARCH(IntEnum): BAILINGMOE = auto() DOTS1 = auto() ARCEE = auto() + HUNYUAN_MOE = auto() class VISION_PROJECTOR_TYPE(IntEnum): @@ -654,6 +655,7 @@ class MODEL_TENSOR(IntEnum): MODEL_ARCH.BAILINGMOE: "bailingmoe", MODEL_ARCH.DOTS1: "dots1", MODEL_ARCH.ARCEE: "arcee", + MODEL_ARCH.HUNYUAN_MOE: "hunyuan-moe", } VISION_PROJECTOR_TYPE_NAMES: dict[VISION_PROJECTOR_TYPE, str] = { @@ -2177,6 +2179,27 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.FFN_DOWN, MODEL_TENSOR.FFN_UP, ], + MODEL_ARCH.HUNYUAN_MOE: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ROPE_FREQS, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_Q_NORM, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_K_NORM, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.FFN_GATE_INP, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_GATE_EXP, + MODEL_TENSOR.FFN_DOWN_EXP, + MODEL_TENSOR.FFN_UP_EXP, + MODEL_TENSOR.FFN_GATE_SHEXP, + MODEL_TENSOR.FFN_DOWN_SHEXP, + MODEL_TENSOR.FFN_UP_SHEXP, + ], # TODO } diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index b30f77dbe3be7..ebc07043147b8 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -303,6 +303,7 @@ class TensorNameMap: "model.layers.{bid}.block_sparse_moe.router.layer", # granitemoe "model.layers.{bid}.feed_forward.router", # llama4 "encoder.layers.{bid}.mlp.router.layer", # nomic-bert-moe + "model.layers.{bid}.mlp.gate.wg.weight", # hunyuan ), MODEL_TENSOR.FFN_GATE_INP_SHEXP: ( @@ -362,6 +363,7 @@ class TensorNameMap: "model.layers.{bid}.mlp.shared_expert.up_proj", # qwen2moe "model.layers.{bid}.mlp.shared_experts.up_proj", # deepseek deepseek2 "model.layers.{bid}.feed_forward.shared_expert.up_proj", # llama4 + "model.layers.{bid}.mlp.shared_mlp.up_proj.weight", # hunyuan ), # AWQ-activation gate @@ -398,6 +400,7 @@ class TensorNameMap: "model.layers.{bid}.mlp.shared_expert.gate_proj", # qwen2moe "model.layers.{bid}.mlp.shared_experts.gate_proj", # deepseek deepseek2 "model.layers.{bid}.feed_forward.shared_expert.gate_proj", # llama4 + "model.layers.{bid}.mlp.shared_mlp.gate_proj.weight", # hunyuan ), # Feed-forward down @@ -447,11 +450,13 @@ class TensorNameMap: "model.layers.{bid}.mlp.shared_experts.down_proj", # deepseek deepseek2 "model.layers.{bid}.feed_forward.shared_expert.down_proj", # llama4 "model.layers.{bid}.shared_mlp.output_linear", # granitemoe + "model.layers.{bid}.mlp.shared_mlp.down_proj.weight", # hunyuan ), MODEL_TENSOR.ATTN_Q_NORM: ( "language_model.encoder.layers.{bid}.self_attention.q_layernorm", "model.layers.{bid}.self_attn.q_layernorm", # persimmon + "model.layers.{bid}.self_attn.query_layernorm", # hunyuan "model.layers.{bid}.self_attn.q_norm", # cohere olmoe chameleon olmo2 "transformer.blocks.{bid}.attn.q_ln", # sea-lion "encoder.layer.{bid}.attention.self.layer_norm_q", # jina-bert-v2 @@ -461,6 +466,7 @@ class TensorNameMap: MODEL_TENSOR.ATTN_K_NORM: ( "language_model.encoder.layers.{bid}.self_attention.k_layernorm", "model.layers.{bid}.self_attn.k_layernorm", # persimmon + "model.layers.{bid}.self_attn.key_layernorm", # hunyuan "model.layers.{bid}.self_attn.k_norm", # cohere olmoe chameleon olmo2 "transformer.blocks.{bid}.attn.k_ln", # sea-lion "encoder.layer.{bid}.attention.self.layer_norm_k", # jina-bert-v2 From 38acf7fe9f653bfb3b2aac319745514dd803adf1 Mon Sep 17 00:00:00 2001 From: Xuan-Son Nguyen Date: Fri, 27 Jun 2025 18:49:13 +0200 Subject: [PATCH 02/22] tokenizer ok --- convert_hf_to_gguf.py | 31 ++++++++++++------------------- include/llama.h | 1 + src/llama-vocab.cpp | 5 +++++ 3 files changed, 18 insertions(+), 19 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 82f0000377cdd..037f4cff887f1 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -6404,31 +6404,24 @@ def __init__(self, *args, **kwargs): def set_vocab(self): self._set_vocab_gpt2() - def _load_tiktoken_bpe(tiktoken_bpe_file: str) -> Dict[bytes, int]: - import base64 - dic = {} - rank = 0 - for line in open(tiktoken_bpe_file, "rb"): - if line: - token, _ = line.split() - if base64.b64decode(token) in dic: - continue - dic[base64.b64decode(token)] = int(rank) - rank += 1 - global SPECIAL_START_ID - SPECIAL_START_ID=rank - return dic - def get_vocab_base(self) -> tuple[list[str], list[int], str]: tokens: list[str] = [] toktypes: list[int] = [] from transformers import AutoTokenizer tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True) - print(tokenizer) - print(tokenizer.tokenizer) - print(type(tokenizer.decoder)) - # exit(0) + + merges = [] + vocab = {} + mergeable_ranks = tokenizer.mergeable_ranks + for token, rank in mergeable_ranks.items(): + vocab[QwenModel.token_bytes_to_string(token)] = rank + if len(token) == 1: + continue + merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank) + if len(merged) == 2: + merges.append(' '.join(map(QwenModel.token_bytes_to_string, merged))) + self.gguf_writer.add_token_merges(merges) reverse_vocab = tokenizer.decoder assert max(reverse_vocab.keys()) < tokenizer.vocab_size diff --git a/include/llama.h b/include/llama.h index 3eda9bc68608c..dc86aea41dcbd 100644 --- a/include/llama.h +++ b/include/llama.h @@ -117,6 +117,7 @@ extern "C" { LLAMA_VOCAB_PRE_TYPE_LLAMA4 = 33, LLAMA_VOCAB_PRE_TYPE_PIXTRAL = 34, LLAMA_VOCAB_PRE_TYPE_SEED_CODER = 35, + LLAMA_VOCAB_PRE_TYPE_HUNYUAN = 36, }; enum llama_rope_type { diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp index 5c9eb87566dde..551bba171c0e0 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -351,6 +351,7 @@ struct llm_tokenizer_bpe : llm_tokenizer { break; case LLAMA_VOCAB_PRE_TYPE_STABLELM2: case LLAMA_VOCAB_PRE_TYPE_QWEN2: + case LLAMA_VOCAB_PRE_TYPE_HUNYUAN: regex_exprs = { // original regex from tokenizer.json // "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+" @@ -1656,6 +1657,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { tokenizer_pre == "seed-coder") { pre_type = LLAMA_VOCAB_PRE_TYPE_SEED_CODER; clean_spaces = false; + } else if ( + tokenizer_pre == "hunyuan") { + pre_type = LLAMA_VOCAB_PRE_TYPE_HUNYUAN; + clean_spaces = false; } else { throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str())); } From 35591a9a741de4267e8f3d74b111487d32aee6d5 Mon Sep 17 00:00:00 2001 From: Xuan-Son Nguyen Date: Fri, 27 Jun 2025 19:05:20 +0200 Subject: [PATCH 03/22] fix tensor name --- convert_hf_to_gguf.py | 3 ++- gguf-py/gguf/tensor_mapping.py | 8 ++++---- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 037f4cff887f1..155df408f01ad 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -14,7 +14,7 @@ from enum import IntEnum from pathlib import Path from hashlib import sha256 -from typing import TYPE_CHECKING, Any, Callable, ContextManager, Iterable, Iterator, Literal, Sequence, TypeVar, Dict, cast +from typing import TYPE_CHECKING, Any, Callable, ContextManager, Iterable, Iterator, Literal, Sequence, TypeVar, cast from itertools import chain from transformers import AutoConfig @@ -6411,6 +6411,7 @@ def get_vocab_base(self) -> tuple[list[str], list[int], str]: from transformers import AutoTokenizer tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True) + # merge logic is copied from QwenModel, maybe incorrect merges = [] vocab = {} mergeable_ranks = tokenizer.mergeable_ranks diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index ebc07043147b8..87d95c776ed9a 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -303,7 +303,7 @@ class TensorNameMap: "model.layers.{bid}.block_sparse_moe.router.layer", # granitemoe "model.layers.{bid}.feed_forward.router", # llama4 "encoder.layers.{bid}.mlp.router.layer", # nomic-bert-moe - "model.layers.{bid}.mlp.gate.wg.weight", # hunyuan + "model.layers.{bid}.mlp.gate.wg", # hunyuan ), MODEL_TENSOR.FFN_GATE_INP_SHEXP: ( @@ -363,7 +363,7 @@ class TensorNameMap: "model.layers.{bid}.mlp.shared_expert.up_proj", # qwen2moe "model.layers.{bid}.mlp.shared_experts.up_proj", # deepseek deepseek2 "model.layers.{bid}.feed_forward.shared_expert.up_proj", # llama4 - "model.layers.{bid}.mlp.shared_mlp.up_proj.weight", # hunyuan + "model.layers.{bid}.mlp.shared_mlp.up_proj", # hunyuan ), # AWQ-activation gate @@ -400,7 +400,7 @@ class TensorNameMap: "model.layers.{bid}.mlp.shared_expert.gate_proj", # qwen2moe "model.layers.{bid}.mlp.shared_experts.gate_proj", # deepseek deepseek2 "model.layers.{bid}.feed_forward.shared_expert.gate_proj", # llama4 - "model.layers.{bid}.mlp.shared_mlp.gate_proj.weight", # hunyuan + "model.layers.{bid}.mlp.shared_mlp.gate_proj", # hunyuan ), # Feed-forward down @@ -450,7 +450,7 @@ class TensorNameMap: "model.layers.{bid}.mlp.shared_experts.down_proj", # deepseek deepseek2 "model.layers.{bid}.feed_forward.shared_expert.down_proj", # llama4 "model.layers.{bid}.shared_mlp.output_linear", # granitemoe - "model.layers.{bid}.mlp.shared_mlp.down_proj.weight", # hunyuan + "model.layers.{bid}.mlp.shared_mlp.down_proj", # hunyuan ), MODEL_TENSOR.ATTN_Q_NORM: ( From cb1f9f2d79e4db6e67a6db9ac66ad8b7df148273 Mon Sep 17 00:00:00 2001 From: Xuan-Son Nguyen Date: Fri, 27 Jun 2025 19:35:18 +0200 Subject: [PATCH 04/22] cgraph init --- convert_hf_to_gguf.py | 11 +++ src/llama-arch.cpp | 25 +++++ src/llama-arch.h | 1 + src/llama-model.cpp | 223 ++++++++++++++++++++++++++++++++++++++++++ src/llama-model.h | 1 + 5 files changed, 261 insertions(+) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 155df408f01ad..12de60442a43e 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -6463,6 +6463,17 @@ def get_vocab_base(self) -> tuple[list[str], list[int], str]: def set_gguf_parameters(self): super().set_gguf_parameters() + self.gguf_writer.add_expert_count(self.hparams["num_experts"]) + self.gguf_writer.add_expert_shared_feed_forward_length(self.hparams["intermediate_size"]) + + moe_intermediate_size = self.hparams["moe_intermediate_size"] + assert all(n == moe_intermediate_size[0] for n in moe_intermediate_size) + self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size[0]) + + moe_topk = self.hparams["moe_topk"] + assert all(topk == moe_topk[0] for topk in moe_topk) + self.gguf_writer.add_expert_used_count(moe_topk[0]) + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: # process the experts separately if name.find("mlp.experts") != -1: diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index 435e3b9ba3db8..403dfe8fcc028 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -76,6 +76,7 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_BAILINGMOE, "bailingmoe" }, { LLM_ARCH_DOTS1, "dots1" }, { LLM_ARCH_ARCEE, "arcee" }, + { LLM_ARCH_HUNYUAN_MOE, "hunyuan-moe" }, { LLM_ARCH_UNKNOWN, "(unknown)" }, }; @@ -1658,6 +1659,30 @@ static const std::map> LLM_TENSOR_N { LLM_TENSOR_FFN_EXP_PROBS_B, "blk.%d.exp_probs_b" }, } }, + { + LLM_ARCH_HUNYUAN_MOE, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_OUTPUT, "output" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, + { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" }, + { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, + { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" }, + { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" }, + { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" }, + { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, + { LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" }, + { LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" }, + { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" }, + { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" }, + { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" }, + { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" }, + }, + }, { LLM_ARCH_UNKNOWN, { diff --git a/src/llama-arch.h b/src/llama-arch.h index 9181ad053f6b3..14617d2caf061 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -80,6 +80,7 @@ enum llm_arch { LLM_ARCH_BAILINGMOE, LLM_ARCH_DOTS1, LLM_ARCH_ARCEE, + LLM_ARCH_HUNYUAN_MOE, LLM_ARCH_UNKNOWN, }; diff --git a/src/llama-model.cpp b/src/llama-model.cpp index fc39195ed5177..d5ec4c6b2f045 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -101,6 +101,7 @@ const char * llm_type_name(llm_type type) { case LLM_TYPE_57B_A14B: return "57B.A14B"; case LLM_TYPE_17B_16E: return "17Bx16E (Scout)"; case LLM_TYPE_17B_128E: return "17Bx128E (Maverick)"; + case LLM_TYPE_A13B: return "A13B"; case LLM_TYPE_30B_A3B: return "30B.A3B"; case LLM_TYPE_235B_A22B: return "235B.A22B"; case LLM_TYPE_E2B: return "E2B"; @@ -1504,6 +1505,18 @@ void llama_model::load_hparams(llama_model_loader & ml) { default: type = LLM_TYPE_UNKNOWN; } } break; + case LLM_ARCH_HUNYUAN_MOE: + { + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + + hparams.n_ff_exp = hparams.n_ff(0); + hparams.n_ff_shexp = hparams.n_ff_exp; + + switch (hparams.n_layer) { + case 32: type = LLM_TYPE_A13B; break; + default: type = LLM_TYPE_UNKNOWN; + } + } break; default: throw std::runtime_error("unsupported model architecture"); } @@ -4348,6 +4361,43 @@ bool llama_model::load_tensors(llama_model_loader & ml) { layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); } } break; + case LLM_ARCH_HUNYUAN_MOE: + { + tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); + + // output + output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0); + output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED); + // if output is NULL, init from the input tok embed + if (output == NULL) { + output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); + } + + for (int i = 0; i < n_layer; ++i) { + auto & layer = layers[i]; + + layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0); + + layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0); + layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0); + layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0); + layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0); + + layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0); + layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0); + + layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0); + + layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0); + layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0); + layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0); + layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0); + + layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0); + layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, hparams.n_ff_shexp}, 0); + layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd}, 0); + } + } break; default: throw std::runtime_error("unknown architecture"); } @@ -14260,6 +14310,174 @@ struct llm_build_arcee : public llm_graph_context { } }; +struct llm_build_hunyuan_moe : public llm_graph_context { + llm_build_hunyuan_moe(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { + const int64_t n_embd_head = hparams.n_embd_head_v; + + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + GGML_ASSERT(n_embd_head == hparams.n_rot); + + ggml_tensor * cur; + ggml_tensor * inpL; + + inpL = build_inp_embd(model.tok_embd); + + // inp_pos - contains the positions + ggml_tensor * inp_pos = build_inp_pos(); + + auto * inp_attn = build_attn_inp_kv_unified(); + + const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale; + + ggml_tensor * inp_out_ids = build_inp_out_ids(); + + for (int il = 0; il < n_layer; ++il) { + ggml_tensor * inpSA = inpL; + + // norm + cur = build_norm(inpL, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "attn_norm", il); + + // self-attention + { + // rope freq factors for llama3; may return nullptr for llama2 and other models + ggml_tensor * rope_factors = model.get_rope_factors(cparams, il); + + // compute Q and K and RoPE them + ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); + if (model.layers[il].bq) { + Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); + cb(Qcur, "Qcur", il); + } + + ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); + if (model.layers[il].bk) { + Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); + cb(Kcur, "Kcur", il); + } + + ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); + if (model.layers[il].bv) { + Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); + cb(Vcur, "Vcur", il); + } + + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); + + Qcur = ggml_rope_ext( + ctx0, Qcur, inp_pos, rope_factors, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + + Kcur = ggml_rope_ext( + ctx0, Kcur, inp_pos, rope_factors, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + + if (model.layers[il].attn_k_norm) { + Kcur = build_norm(Kcur, + model.layers[il].attn_k_norm, model.layers[il].attn_k_norm_b, + LLM_NORM_RMS, il); + cb(Kcur, "Kcur_norm", il); + } + + if (model.layers[il].attn_q_norm) { + Qcur = build_norm(Qcur, + model.layers[il].attn_q_norm, model.layers[il].attn_q_norm_b, + LLM_NORM_RMS, il); + cb(Qcur, "Qcur_norm", il); + } + + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); + + cur = build_attn(inp_attn, gf, + model.layers[il].wo, model.layers[il].bo, + Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il); + cb(cur, "attn_out", il); + } + + if (il == n_layer - 1 && inp_out_ids) { + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + } + + ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); + + ffn_inp = build_norm(ffn_inp, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "ffn_norm", il); + + // feed-forward network (non-MoE) + ggml_tensor * cur_mlp = nullptr; + { + cur_mlp = build_ffn(ffn_inp, + model.layers[il].ffn_up_shexp, NULL, NULL, + model.layers[il].ffn_gate_shexp, NULL, NULL, + model.layers[il].ffn_down_shexp, NULL, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(cur_mlp, "ffn_out", il); + } + + // MoE branch + ggml_tensor * cur_moe = nullptr; + { + cur_moe = build_moe_ffn(ffn_inp, + model.layers[il].ffn_gate_inp, + model.layers[il].ffn_up_exps, + model.layers[il].ffn_gate_exps, + model.layers[il].ffn_down_exps, + nullptr, + n_expert, n_expert_used, + LLM_FFN_SILU, true, + false, 0.0, + LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, + il); + cb(cur_moe, "ffn_moe_out", il); + } + + cur = ggml_add(ctx0, ggml_add(ctx0, cur_moe, cur_mlp), ffn_inp); + cb(cur, "ffn_out", il); + + cur = build_cvec(cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = build_norm(cur, + model.output_norm, NULL, + LLM_NORM_RMS, -1); + + cb(cur, "result_norm", -1); + res->t_embd = cur; + + // lm_head + cur = build_lora_mm(model.output, cur); + + cb(cur, "result_output", -1); + res->t_logits = cur; + + ggml_build_forward_expand(gf, cur); + } +}; + llama_memory_i * llama_model::create_memory(const llama_memory_params & params, llama_cparams & cparams) const { llama_memory_i * res; @@ -14635,6 +14853,10 @@ llm_graph_result_ptr llama_model::build_graph( { llm = std::make_unique(*this, params, gf); } break; + case LLM_ARCH_HUNYUAN_MOE: + { + llm = std::make_unique(*this, params, gf); + } break; default: GGML_ABORT("fatal error"); } @@ -14786,6 +15008,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { case LLM_ARCH_BAILINGMOE: case LLM_ARCH_NEO_BERT: case LLM_ARCH_ARCEE: + case LLM_ARCH_HUNYUAN_MOE: return LLAMA_ROPE_TYPE_NORM; // the pairs of head values are offset by n_rot/2 diff --git a/src/llama-model.h b/src/llama-model.h index 40063b790d434..5bbfa108fc9e0 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -93,6 +93,7 @@ enum llm_type { LLM_TYPE_57B_A14B, LLM_TYPE_17B_16E, // llama4 Scout LLM_TYPE_17B_128E, // llama4 Maverick + LLM_TYPE_A13B, LLM_TYPE_30B_A3B, LLM_TYPE_235B_A22B, LLM_TYPE_E2B, From 51886a47ad9fd2b539d8c0fdc1f796ce24a1579c Mon Sep 17 00:00:00 2001 From: Xuan-Son Nguyen Date: Fri, 27 Jun 2025 19:35:22 +0200 Subject: [PATCH 05/22] chat template --- src/llama-chat.cpp | 18 ++++++++++++++++++ src/llama-chat.h | 1 + 2 files changed, 19 insertions(+) diff --git a/src/llama-chat.cpp b/src/llama-chat.cpp index 5d317f4ee62eb..d05335a685b49 100644 --- a/src/llama-chat.cpp +++ b/src/llama-chat.cpp @@ -64,6 +64,7 @@ static const std::map LLM_CHAT_TEMPLATES = { { "bailing", LLM_CHAT_TEMPLATE_BAILING }, { "llama4", LLM_CHAT_TEMPLATE_LLAMA4 }, { "smolvlm", LLM_CHAT_TEMPLATE_SMOLVLM }, + { "hunyuan-moe", LLM_CHAT_TEMPLATE_HUNYUAN_MOE }, }; llm_chat_template llm_chat_template_from_str(const std::string & name) { @@ -185,6 +186,8 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) { return LLM_CHAT_TEMPLATE_LLAMA4; } else if (tmpl_contains("<|endofuserprompt|>")) { return LLM_CHAT_TEMPLATE_DOTS1; + } else if (tmpl_contains("<|startoftext|>") && tmpl_contains("<|extra_4|>")) { + return LLM_CHAT_TEMPLATE_HUNYUAN_MOE; } return LLM_CHAT_TEMPLATE_UNKNOWN; } @@ -665,6 +668,21 @@ int32_t llm_chat_apply_template( if (add_ass) { ss << "<|response|>"; } + } else if (tmpl == LLM_CHAT_TEMPLATE_HUNYUAN_MOE) { + // tencent/Hunyuan-A13B-Instruct + for (auto message : chat) { + std::string role(message->role); + if (role == "system") { + ss << "<|startoftext|>" << message->content << "<|extra_4|>"; + } else if (role == "assistant") { + ss << "<|startoftext|>" << message->content << "<|eos|>"; + } else { + ss << "<|startoftext|>" << message->content << "<|extra_0|>"; + } + } + if (add_ass) { + ss << "<|startoftext|>"; + } } else { // template not supported return -1; diff --git a/src/llama-chat.h b/src/llama-chat.h index 38800010ae48b..b621fda281669 100644 --- a/src/llama-chat.h +++ b/src/llama-chat.h @@ -44,6 +44,7 @@ enum llm_chat_template { LLM_CHAT_TEMPLATE_LLAMA4, LLM_CHAT_TEMPLATE_SMOLVLM, LLM_CHAT_TEMPLATE_DOTS1, + LLM_CHAT_TEMPLATE_HUNYUAN_MOE, LLM_CHAT_TEMPLATE_UNKNOWN, }; From cff16cc6c232e44d13559a551ac2b51014bcccd9 Mon Sep 17 00:00:00 2001 From: Xuan-Son Nguyen Date: Sat, 28 Jun 2025 01:09:08 +0200 Subject: [PATCH 06/22] wip --- src/llama-graph.cpp | 7 ++++ src/llama-model.cpp | 87 ++++++++++++++++++++------------------------- 2 files changed, 46 insertions(+), 48 deletions(-) diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index 71ee431a977ba..a299c89ecc7e4 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -705,6 +705,13 @@ ggml_tensor * llm_graph_context::build_moe_ffn( ggml_reshape_3d(ctx0, probs, 1, n_expert, n_tokens), selected_experts); // [1, n_expert_used, n_tokens] cb(weights, "ffn_moe_weights", il); + if (arch == LLM_ARCH_HUNYUAN_MOE) { + weights = ggml_reshape_2d(ctx0, weights, n_expert_used, n_tokens); // [n_expert_used, n_tokens] + weights = ggml_div(ctx0, weights, ggml_sum_rows(ctx0, weights)); // [1, n_tokens] + weights = ggml_reshape_3d(ctx0, weights, 1, n_expert_used, n_tokens); // [1, n_expert_used, n_tokens] + cb(weights, "ffn_moe_weights_scaled", il); + } + if (norm_w) { weights = ggml_reshape_2d(ctx0, weights, n_expert_used, n_tokens); diff --git a/src/llama-model.cpp b/src/llama-model.cpp index d5ec4c6b2f045..86382aba5934c 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -1507,10 +1507,9 @@ void llama_model::load_hparams(llama_model_loader & ml) { } break; case LLM_ARCH_HUNYUAN_MOE: { - ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - - hparams.n_ff_exp = hparams.n_ff(0); - hparams.n_ff_shexp = hparams.n_ff_exp; + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp); + ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp); switch (hparams.n_layer) { case 32: type = LLM_TYPE_A13B; break; @@ -14377,29 +14376,25 @@ struct llm_build_hunyuan_moe : public llm_graph_context { ext_factor, attn_factor, beta_fast, beta_slow ); + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); + Kcur = ggml_rope_ext( ctx0, Kcur, inp_pos, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); - if (model.layers[il].attn_k_norm) { - Kcur = build_norm(Kcur, - model.layers[il].attn_k_norm, model.layers[il].attn_k_norm_b, - LLM_NORM_RMS, il); - cb(Kcur, "Kcur_norm", il); - } - - if (model.layers[il].attn_q_norm) { - Qcur = build_norm(Qcur, - model.layers[il].attn_q_norm, model.layers[il].attn_q_norm_b, - LLM_NORM_RMS, il); - cb(Qcur, "Qcur_norm", il); - } + Kcur = build_norm(Kcur, + model.layers[il].attn_k_norm, nullptr, + LLM_NORM_RMS, il); + cb(Kcur, "Kcur_norm", il); - cb(Qcur, "Qcur", il); - cb(Kcur, "Kcur", il); - cb(Vcur, "Vcur", il); + Qcur = build_norm(Qcur, + model.layers[il].attn_q_norm, nullptr, + LLM_NORM_RMS, il); + cb(Qcur, "Qcur_norm", il); cur = build_attn(inp_attn, gf, model.layers[il].wo, model.layers[il].bo, @@ -14415,42 +14410,38 @@ struct llm_build_hunyuan_moe : public llm_graph_context { ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); cb(ffn_inp, "ffn_inp", il); - ffn_inp = build_norm(ffn_inp, + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); // feed-forward network (non-MoE) - ggml_tensor * cur_mlp = nullptr; - { - cur_mlp = build_ffn(ffn_inp, - model.layers[il].ffn_up_shexp, NULL, NULL, - model.layers[il].ffn_gate_shexp, NULL, NULL, - model.layers[il].ffn_down_shexp, NULL, NULL, - NULL, - LLM_FFN_SILU, LLM_FFN_PAR, il); - cb(cur_mlp, "ffn_out", il); - } + ggml_tensor * cur_mlp = build_ffn(cur, + model.layers[il].ffn_up_shexp, NULL, NULL, + model.layers[il].ffn_gate_shexp, NULL, NULL, + model.layers[il].ffn_down_shexp, NULL, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(cur_mlp, "ffn_mlp", il); // MoE branch - ggml_tensor * cur_moe = nullptr; - { - cur_moe = build_moe_ffn(ffn_inp, - model.layers[il].ffn_gate_inp, - model.layers[il].ffn_up_exps, - model.layers[il].ffn_gate_exps, - model.layers[il].ffn_down_exps, - nullptr, - n_expert, n_expert_used, - LLM_FFN_SILU, true, - false, 0.0, - LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, - il); - cb(cur_moe, "ffn_moe_out", il); - } + ggml_tensor * cur_moe = build_moe_ffn(cur, + model.layers[il].ffn_gate_inp, + model.layers[il].ffn_up_exps, + model.layers[il].ffn_gate_exps, + model.layers[il].ffn_down_exps, + nullptr, + n_expert, n_expert_used, + LLM_FFN_SILU, false, + false, 0.0, + LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, + il); + cb(cur_moe, "ffn_moe_out", il); - cur = ggml_add(ctx0, ggml_add(ctx0, cur_moe, cur_mlp), ffn_inp); - cb(cur, "ffn_out", il); + ggml_tensor * ffn_out = ggml_add(ctx0, cur_moe, cur_mlp); + cb(ffn_out, "ffn_out", il); + + cur = ggml_add(ctx0, ffn_out, ffn_inp); cur = build_cvec(cur, il); cb(cur, "l_out", il); From 5e78e887369a12aaab450ec713507e646a44c45b Mon Sep 17 00:00:00 2001 From: kooshi <1934337+kooshi@users.noreply.github.com> Date: Fri, 27 Jun 2025 18:38:13 -0500 Subject: [PATCH 07/22] almost working --- convert_hf_to_gguf.py | 74 ++++++++++++++++++++++--------------- gguf-py/gguf/constants.py | 1 + gguf-py/gguf/gguf_writer.py | 3 ++ 3 files changed, 48 insertions(+), 30 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 12de60442a43e..1ad4d913435f5 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -842,14 +842,14 @@ def get_vocab_base_pre(self, tokenizer) -> str: def _set_vocab_none(self) -> None: self.gguf_writer.add_tokenizer_model("none") - def _set_vocab_gpt2(self) -> None: + def _set_vocab_gpt2(self, load_merges=True) -> None: tokens, toktypes, tokpre = self.get_vocab_base() self.gguf_writer.add_tokenizer_model("gpt2") self.gguf_writer.add_tokenizer_pre(tokpre) self.gguf_writer.add_token_list(tokens) self.gguf_writer.add_token_types(toktypes) - special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True) + special_vocab = gguf.SpecialVocab(self.dir_model, load_merges) special_vocab.add_to_gguf(self.gguf_writer) def _set_vocab_qwen(self): @@ -6394,15 +6394,14 @@ def set_gguf_parameters(self): @ModelBase.register("HunYuanMoEV1ForCausalLM") -class HunYuanMoEModel(LlamaModel): +class HunYuanMoEModel(TextModel): model_arch = gguf.MODEL_ARCH.HUNYUAN_MOE - undo_permute = False def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) def set_vocab(self): - self._set_vocab_gpt2() + self._set_vocab_gpt2(load_merges=False) def get_vocab_base(self) -> tuple[list[str], list[int], str]: tokens: list[str] = [] @@ -6411,52 +6410,41 @@ def get_vocab_base(self) -> tuple[list[str], list[int], str]: from transformers import AutoTokenizer tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True) - # merge logic is copied from QwenModel, maybe incorrect merges = [] - vocab = {} mergeable_ranks = tokenizer.mergeable_ranks for token, rank in mergeable_ranks.items(): - vocab[QwenModel.token_bytes_to_string(token)] = rank if len(token) == 1: continue + # bpe() will decompose the token into its smallest parts and then + # re-merge them. If the token is a valid merge, bpe() will return + # the two pieces that were merged to create it. merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank) if len(merged) == 2: merges.append(' '.join(map(QwenModel.token_bytes_to_string, merged))) self.gguf_writer.add_token_merges(merges) + vocab_size = self.hparams["vocab_size"] + reverse_vocab = tokenizer.decoder - assert max(reverse_vocab.keys()) < tokenizer.vocab_size + assert max(reverse_vocab.keys()) < tokenizer.vocab_size, tokenizer.vocab_size == vocab_size tokpre = self.get_vocab_base_pre(tokenizer) - added_vocab = tokenizer.get_added_vocab() + special_token_ids = set(tokenizer.special_tokens.values()) - added_tokens_decoder = tokenizer.added_tokens_decoder + tokens: list[str] = [] + toktypes: list[int] = [] - for i in range(tokenizer.vocab_size): + for i in range(vocab_size): if i not in reverse_vocab: tokens.append(f"[PAD{i}]") toktypes.append(gguf.TokenType.UNUSED) else: - token: str = reverse_vocab[i] - if token in added_vocab: - # The tokenizer in llama.cpp assumes the CONTROL and USER_DEFINED tokens are pre-normalized. - # To avoid unexpected issues - we make sure to normalize non-normalized tokens - if not added_tokens_decoder[i].normalized: - previous_token = token - token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False)) - if previous_token != token: - logger.info(f"{repr(previous_token)} is encoded and decoded back to {repr(token)} using AutoTokenizer") - - if added_tokens_decoder[i].special or self.does_token_look_special(token): - toktypes.append(gguf.TokenType.CONTROL) - else: - # NOTE: this was added for Gemma. - # Encoding and decoding the tokens above isn't sufficient for this case. - token = token.replace(b"\xe2\x96\x81".decode("utf-8"), " ") # pre-normalize user-defined spaces - toktypes.append(gguf.TokenType.USER_DEFINED) + token = reverse_vocab[i] + tokens.append(token) + if i in special_token_ids: + toktypes.append(gguf.TokenType.CONTROL) else: toktypes.append(gguf.TokenType.NORMAL) - tokens.append(token) return tokens, toktypes, tokpre @@ -6474,6 +6462,25 @@ def set_gguf_parameters(self): assert all(topk == moe_topk[0] for topk in moe_topk) self.gguf_writer.add_expert_used_count(moe_topk[0]) + moe_shared_expert = self.hparams["num_shared_expert"] + assert all(n == moe_shared_expert[0] for n in moe_shared_expert) + self.gguf_writer.add_expert_shared_count(moe_shared_expert[0]) + + self.gguf_writer.add_qk_norm(self.hparams.get("use_qk_norm", True)) + + # Rope + rope_scaling = self.hparams.get("rope_scaling", {}) + if rope_scaling.get("type") == "dynamic": + logger.warning("Model uses 'dynamic' rope scaling, which is not yet supported in GGUF. " + "The resulting model may not work correctly with contexts longer than the training length.") + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE) + else: + # Fallback for other potential scaling types + # This part is inherited from TextModel and will handle standard rope_theta + pass + + _experts: list[dict[str, Tensor]] | None = None + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: # process the experts separately if name.find("mlp.experts") != -1: @@ -6511,6 +6518,13 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter return [(self.map_tensor_name(name), data_torch)] + def prepare_tensors(self): + super().prepare_tensors() + if self._experts is not None: + experts = [k for d in self._experts for k in d.keys()] + if len(experts) > 0: + raise ValueError(f"Unprocessed experts: {experts}") + ###### CONVERSION LOGIC ###### diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index e0a4f688ac7a8..31eb66cef2de4 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -148,6 +148,7 @@ class Attention: VALUE_LENGTH_MLA = "{arch}.attention.value_length_mla" SHARED_KV_LAYERS = "{arch}.attention.shared_kv_layers" SLIDING_WINDOW_PATTERN = "{arch}.attention.sliding_window_pattern" + QK_NORM = "{arch}.attention.qk_norm" class Rope: DIMENSION_COUNT = "{arch}.rope.dimension_count" diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index d32cd479adb17..8096baf244da3 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -792,6 +792,9 @@ def add_group_norm_groups(self, value: int) -> None: def add_causal_attention(self, value: bool) -> None: self.add_bool(Keys.Attention.CAUSAL.format(arch=self.arch), value) + def add_qk_norm(self, value: bool) -> None: + self.add_bool(Keys.Attention.QK_NORM.format(arch=self.arch), value) + def add_q_lora_rank(self, length: int) -> None: self.add_uint32(Keys.Attention.Q_LORA_RANK.format(arch=self.arch), length) From d219580756922e2356ecf641521f2458a1a623fa Mon Sep 17 00:00:00 2001 From: kooshi <1934337+kooshi@users.noreply.github.com> Date: Fri, 27 Jun 2025 19:03:04 -0500 Subject: [PATCH 08/22] skip embed, fix bos --- convert_hf_to_gguf.py | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 1ad4d913435f5..0fe28b8396f41 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -6399,25 +6399,25 @@ class HunYuanMoEModel(TextModel): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) + # FIX for tied embeddings: Capture the token embeddings. + self._tok_embd = None def set_vocab(self): self._set_vocab_gpt2(load_merges=False) + # FIX for BOS token: Manually set the correct BOS token ID. + # The SpecialVocab helper gets incorrect id `bos_token_id: 1` from config.json. + self.gguf_writer.add_bos_token_id(127959) # <|bos|> def get_vocab_base(self) -> tuple[list[str], list[int], str]: - tokens: list[str] = [] - toktypes: list[int] = [] - from transformers import AutoTokenizer tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True) + # Fake merges merges = [] mergeable_ranks = tokenizer.mergeable_ranks for token, rank in mergeable_ranks.items(): if len(token) == 1: continue - # bpe() will decompose the token into its smallest parts and then - # re-merge them. If the token is a valid merge, bpe() will return - # the two pieces that were merged to create it. merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank) if len(merged) == 2: merges.append(' '.join(map(QwenModel.token_bytes_to_string, merged))) @@ -6472,16 +6472,22 @@ def set_gguf_parameters(self): rope_scaling = self.hparams.get("rope_scaling", {}) if rope_scaling.get("type") == "dynamic": logger.warning("Model uses 'dynamic' rope scaling, which is not yet supported in GGUF. " - "The resulting model may not work correctly with contexts longer than the training length.") + "Long-context extrapolation will not work correctly. Setting rope scaling type to NONE.") self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE) - else: - # Fallback for other potential scaling types - # This part is inherited from TextModel and will handle standard rope_theta - pass _experts: list[dict[str, Tensor]] | None = None def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # FIX for tied embeddings: Capture the token embeddings. + if name == "model.embed_tokens.weight": + self._tok_embd = data_torch.clone() + + # FIX for tied embeddings: Skip the lm_head if it's tied. + if name == "lm_head.weight": + if self.hparams.get("tie_word_embeddings", False): + logger.info("Skipping tied output layer 'lm_head.weight'") + return [] + # process the experts separately if name.find("mlp.experts") != -1: n_experts = self.hparams["num_experts"] From 0fd393087d3bfbcaf7db07aaa5ceb557ff8d1dfa Mon Sep 17 00:00:00 2001 From: kooshi <1934337+kooshi@users.noreply.github.com> Date: Fri, 27 Jun 2025 20:44:16 -0500 Subject: [PATCH 09/22] cleanup --- src/llama-arch.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index 403dfe8fcc028..2584dd9c248a5 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -1672,7 +1672,6 @@ static const std::map> LLM_TENSOR_N { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" }, { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, - { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" }, { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" }, { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, { LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" }, From b19ecae2dc1fbd998c6e0cc14ca933f06b5a5af7 Mon Sep 17 00:00:00 2001 From: kooshi <1934337+kooshi@users.noreply.github.com> Date: Fri, 27 Jun 2025 20:54:56 -0500 Subject: [PATCH 10/22] yarn scaling --- convert_hf_to_gguf.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 0fe28b8396f41..62e93adc5ce5b 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -6471,9 +6471,9 @@ def set_gguf_parameters(self): # Rope rope_scaling = self.hparams.get("rope_scaling", {}) if rope_scaling.get("type") == "dynamic": - logger.warning("Model uses 'dynamic' rope scaling, which is not yet supported in GGUF. " - "Long-context extrapolation will not work correctly. Setting rope scaling type to NONE.") - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE) + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN) + self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"]) + self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["max_position_embeddings"]) _experts: list[dict[str, Tensor]] | None = None From 245db1592003253cf90baf1698a5a45d8aa2acd1 Mon Sep 17 00:00:00 2001 From: kooshi <1934337+kooshi@users.noreply.github.com> Date: Fri, 27 Jun 2025 22:52:09 -0500 Subject: [PATCH 11/22] cleanup --- convert_hf_to_gguf.py | 73 +++++++++++++++++++------------------------ src/llama-graph.cpp | 7 ----- src/llama-model.cpp | 6 ++-- 3 files changed, 36 insertions(+), 50 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 62e93adc5ce5b..ecfcc8125a885 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -6399,20 +6399,22 @@ class HunYuanMoEModel(TextModel): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - # FIX for tied embeddings: Capture the token embeddings. + # For handling tied embeddings self._tok_embd = None def set_vocab(self): - self._set_vocab_gpt2(load_merges=False) - # FIX for BOS token: Manually set the correct BOS token ID. - # The SpecialVocab helper gets incorrect id `bos_token_id: 1` from config.json. - self.gguf_writer.add_bos_token_id(127959) # <|bos|> - - def get_vocab_base(self) -> tuple[list[str], list[int], str]: + """ + A self-contained vocab implementation for the HunYuan tiktoken-based tokenizer. + This method correctly generates tokens, types, and the required "fake" merges + to satisfy the llama.cpp GGUF loader. + """ from transformers import AutoTokenizer tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True) - # Fake merges + # 1. Get the pre-tokenizer identifier hash + tokpre = self.get_vocab_base_pre(tokenizer) + + # 2. Reverse-engineer the merges list from mergeable_ranks merges = [] mergeable_ranks = tokenizer.mergeable_ranks for token, rank in mergeable_ranks.items(): @@ -6421,19 +6423,13 @@ def get_vocab_base(self) -> tuple[list[str], list[int], str]: merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank) if len(merged) == 2: merges.append(' '.join(map(QwenModel.token_bytes_to_string, merged))) - self.gguf_writer.add_token_merges(merges) + # 3. Generate the tokens and toktypes lists vocab_size = self.hparams["vocab_size"] - reverse_vocab = tokenizer.decoder - assert max(reverse_vocab.keys()) < tokenizer.vocab_size, tokenizer.vocab_size == vocab_size - - tokpre = self.get_vocab_base_pre(tokenizer) special_token_ids = set(tokenizer.special_tokens.values()) - tokens: list[str] = [] toktypes: list[int] = [] - for i in range(vocab_size): if i not in reverse_vocab: tokens.append(f"[PAD{i}]") @@ -6446,30 +6442,42 @@ def get_vocab_base(self) -> tuple[list[str], list[int], str]: else: toktypes.append(gguf.TokenType.NORMAL) - return tokens, toktypes, tokpre + # 4. Write all vocab-related fields to the GGUF writer + self.gguf_writer.add_tokenizer_model("gpt2") + self.gguf_writer.add_tokenizer_pre(tokpre) + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_types(toktypes) + self.gguf_writer.add_token_merges(merges) + + # 5. Add special tokens and chat templates + special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False) + special_vocab.add_to_gguf(self.gguf_writer) + # FIX for BOS token: Manually set the correct BOS token ID. + self.gguf_writer.add_bos_token_id(127959) # <|bos|> def set_gguf_parameters(self): super().set_gguf_parameters() + hparams = self.hparams - self.gguf_writer.add_expert_count(self.hparams["num_experts"]) - self.gguf_writer.add_expert_shared_feed_forward_length(self.hparams["intermediate_size"]) + self.gguf_writer.add_expert_count(hparams["num_experts"]) + self.gguf_writer.add_expert_shared_feed_forward_length(hparams["intermediate_size"]) - moe_intermediate_size = self.hparams["moe_intermediate_size"] + moe_intermediate_size = hparams["moe_intermediate_size"] assert all(n == moe_intermediate_size[0] for n in moe_intermediate_size) self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size[0]) - moe_topk = self.hparams["moe_topk"] + moe_topk = hparams["moe_topk"] assert all(topk == moe_topk[0] for topk in moe_topk) self.gguf_writer.add_expert_used_count(moe_topk[0]) - moe_shared_expert = self.hparams["num_shared_expert"] + moe_shared_expert = hparams["num_shared_expert"] assert all(n == moe_shared_expert[0] for n in moe_shared_expert) self.gguf_writer.add_expert_shared_count(moe_shared_expert[0]) - self.gguf_writer.add_qk_norm(self.hparams.get("use_qk_norm", True)) + self.gguf_writer.add_qk_norm(hparams.get("use_qk_norm", True)) # Rope - rope_scaling = self.hparams.get("rope_scaling", {}) + rope_scaling = hparams.get("rope_scaling", {}) if rope_scaling.get("type") == "dynamic": self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN) self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"]) @@ -6478,50 +6486,33 @@ def set_gguf_parameters(self): _experts: list[dict[str, Tensor]] | None = None def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - # FIX for tied embeddings: Capture the token embeddings. if name == "model.embed_tokens.weight": self._tok_embd = data_torch.clone() - - # FIX for tied embeddings: Skip the lm_head if it's tied. if name == "lm_head.weight": if self.hparams.get("tie_word_embeddings", False): logger.info("Skipping tied output layer 'lm_head.weight'") return [] - - # process the experts separately if name.find("mlp.experts") != -1: n_experts = self.hparams["num_experts"] assert bid is not None - - tensors: list[tuple[str, Tensor]] = [] - if self._experts is None: self._experts = [{} for _ in range(self.block_count)] - self._experts[bid][name] = data_torch - if len(self._experts[bid]) >= n_experts * 3: - # merge the experts into a single 3d tensor + tensors: list[tuple[str, Tensor]] = [] for w_name in ["down_proj", "gate_proj", "up_proj"]: datas: list[Tensor] = [] - for xid in range(n_experts): ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight" datas.append(self._experts[bid][ename]) del self._experts[bid][ename] - data_torch = torch.stack(datas, dim=0) - merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight" - new_name = self.map_tensor_name(merged_name) - tensors.append((new_name, data_torch)) - return tensors else: return [] - return [(self.map_tensor_name(name), data_torch)] def prepare_tensors(self): diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index a299c89ecc7e4..71ee431a977ba 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -705,13 +705,6 @@ ggml_tensor * llm_graph_context::build_moe_ffn( ggml_reshape_3d(ctx0, probs, 1, n_expert, n_tokens), selected_experts); // [1, n_expert_used, n_tokens] cb(weights, "ffn_moe_weights", il); - if (arch == LLM_ARCH_HUNYUAN_MOE) { - weights = ggml_reshape_2d(ctx0, weights, n_expert_used, n_tokens); // [n_expert_used, n_tokens] - weights = ggml_div(ctx0, weights, ggml_sum_rows(ctx0, weights)); // [1, n_tokens] - weights = ggml_reshape_3d(ctx0, weights, 1, n_expert_used, n_tokens); // [1, n_expert_used, n_tokens] - cb(weights, "ffn_moe_weights_scaled", il); - } - if (norm_w) { weights = ggml_reshape_2d(ctx0, weights, n_expert_used, n_tokens); diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 86382aba5934c..3a25a2ccad377 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -14432,8 +14432,10 @@ struct llm_build_hunyuan_moe : public llm_graph_context { model.layers[il].ffn_down_exps, nullptr, n_expert, n_expert_used, - LLM_FFN_SILU, false, - false, 0.0, + LLM_FFN_SILU, + true, // norm_topk_prob + false, + 0.0, LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il); cb(cur_moe, "ffn_moe_out", il); From 3920faa24252a532e7d26b160901dcf9fcf09e96 Mon Sep 17 00:00:00 2001 From: Xuan-Son Nguyen Date: Sat, 28 Jun 2025 23:09:35 +0200 Subject: [PATCH 12/22] correct rope type --- src/llama-model.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 86382aba5934c..75a9ada0831a3 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -14326,7 +14326,7 @@ struct llm_build_hunyuan_moe : public llm_graph_context { auto * inp_attn = build_attn_inp_kv_unified(); - const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale; + const float kq_scale = 1.0f / sqrtf(float(n_embd_head)); ggml_tensor * inp_out_ids = build_inp_out_ids(); @@ -14999,7 +14999,6 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { case LLM_ARCH_BAILINGMOE: case LLM_ARCH_NEO_BERT: case LLM_ARCH_ARCEE: - case LLM_ARCH_HUNYUAN_MOE: return LLAMA_ROPE_TYPE_NORM; // the pairs of head values are offset by n_rot/2 @@ -15035,6 +15034,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { case LLM_ARCH_EXAONE: case LLM_ARCH_MINICPM3: case LLM_ARCH_DOTS1: + case LLM_ARCH_HUNYUAN_MOE: return LLAMA_ROPE_TYPE_NEOX; case LLM_ARCH_QWEN2VL: From 8fd547bd514abb65cd35c56c94e52c9400733653 Mon Sep 17 00:00:00 2001 From: kooshi <1934337+kooshi@users.noreply.github.com> Date: Sat, 28 Jun 2025 17:12:38 -0500 Subject: [PATCH 13/22] failed token fix --- convert_hf_to_gguf.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index ecfcc8125a885..cd8dfd1652e01 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -6416,18 +6416,21 @@ def set_vocab(self): # 2. Reverse-engineer the merges list from mergeable_ranks merges = [] + vocab = {} mergeable_ranks = tokenizer.mergeable_ranks for token, rank in mergeable_ranks.items(): + #vocab[QwenModel.token_bytes_to_string(token)] = rank if len(token) == 1: continue merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank) - if len(merged) == 2: + if len(merged) == 2: #todo this is an assert in Qwen, why? merges.append(' '.join(map(QwenModel.token_bytes_to_string, merged))) # 3. Generate the tokens and toktypes lists vocab_size = self.hparams["vocab_size"] - reverse_vocab = tokenizer.decoder special_token_ids = set(tokenizer.special_tokens.values()) + reverse_vocab = tokenizer.decoder + #reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in {**vocab, **special_token_ids}.items()} tokens: list[str] = [] toktypes: list[int] = [] for i in range(vocab_size): From 4d66bdc0b6580eb94eaf829ace523b4bc9d547d4 Mon Sep 17 00:00:00 2001 From: Xuan-Son Nguyen Date: Sun, 29 Jun 2025 01:15:35 +0200 Subject: [PATCH 14/22] ntk alpha freq_base --- src/llama-model.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 75a9ada0831a3..172a1d7409caa 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -1511,6 +1511,11 @@ void llama_model::load_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp); ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp); + // TODO: read from gguf + float n_dim = hparams.n_embd_head_k; + float alpha = 1000.0f; // NTK-Aware + hparams.rope_freq_base_train = 10000.0f * std::powf(alpha, n_dim / (n_dim - 2.0f)); + switch (hparams.n_layer) { case 32: type = LLM_TYPE_A13B; break; default: type = LLM_TYPE_UNKNOWN; From b20bd2639a9b42be5d90b496ee8a1ac966b958fd Mon Sep 17 00:00:00 2001 From: kooshi <1934337+kooshi@users.noreply.github.com> Date: Sat, 28 Jun 2025 19:44:31 -0500 Subject: [PATCH 15/22] tokenization working --- convert_hf_to_gguf.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index cd8dfd1652e01..9d39e0597ad17 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -346,6 +346,8 @@ def prepare_tensors(self): data_qtype = gguf.GGMLQuantizationType.BF16 elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0: data_qtype = gguf.GGMLQuantizationType.Q8_0 + elif self.ftype == gguf.LlamaFileType.MOSTLY_Q4_0: + data_qtype = gguf.GGMLQuantizationType.Q4_0 elif self.ftype == gguf.LlamaFileType.MOSTLY_TQ1_0: data_qtype = gguf.GGMLQuantizationType.TQ1_0 elif self.ftype == gguf.LlamaFileType.MOSTLY_TQ2_0: @@ -6419,7 +6421,7 @@ def set_vocab(self): vocab = {} mergeable_ranks = tokenizer.mergeable_ranks for token, rank in mergeable_ranks.items(): - #vocab[QwenModel.token_bytes_to_string(token)] = rank + vocab[QwenModel.token_bytes_to_string(token)] = rank if len(token) == 1: continue merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank) @@ -6428,9 +6430,8 @@ def set_vocab(self): # 3. Generate the tokens and toktypes lists vocab_size = self.hparams["vocab_size"] - special_token_ids = set(tokenizer.special_tokens.values()) - reverse_vocab = tokenizer.decoder - #reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in {**vocab, **special_token_ids}.items()} + special_tokens = tokenizer.special_tokens + reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in {**vocab, **special_tokens}.items()} tokens: list[str] = [] toktypes: list[int] = [] for i in range(vocab_size): @@ -6440,7 +6441,7 @@ def set_vocab(self): else: token = reverse_vocab[i] tokens.append(token) - if i in special_token_ids: + if i in special_tokens.values(): toktypes.append(gguf.TokenType.CONTROL) else: toktypes.append(gguf.TokenType.NORMAL) @@ -6614,7 +6615,7 @@ def parse_args() -> argparse.Namespace: help="path to write to; default: based on input. {ftype} will be replaced by the outtype.", ) parser.add_argument( - "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "tq1_0", "tq2_0", "auto"], default="f16", + "--outtype", type=str, choices=["f32", "f16", "bf16", "q4_0", "q8_0", "tq1_0", "tq2_0", "auto"], default="f16", help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, tq1_0 or tq2_0 for ternary, and auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type", ) parser.add_argument( @@ -6746,6 +6747,7 @@ def main() -> None: "f32": gguf.LlamaFileType.ALL_F32, "f16": gguf.LlamaFileType.MOSTLY_F16, "bf16": gguf.LlamaFileType.MOSTLY_BF16, + "q4_0": gguf.LlamaFileType.MOSTLY_Q4_0, "q8_0": gguf.LlamaFileType.MOSTLY_Q8_0, "tq1_0": gguf.LlamaFileType.MOSTLY_TQ1_0, "tq2_0": gguf.LlamaFileType.MOSTLY_TQ2_0, From 1221d944ca3b5e97426c5b31aa9d053ac0a323c7 Mon Sep 17 00:00:00 2001 From: kooshi <1934337+kooshi@users.noreply.github.com> Date: Sun, 29 Jun 2025 08:52:41 -0500 Subject: [PATCH 16/22] cleanup and pr changes --- convert_hf_to_gguf.py | 25 +++++++++++++++---------- gguf-py/gguf/constants.py | 1 - gguf-py/gguf/gguf_writer.py | 3 --- 3 files changed, 15 insertions(+), 14 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 9d39e0597ad17..d4be8f96248a4 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -844,14 +844,14 @@ def get_vocab_base_pre(self, tokenizer) -> str: def _set_vocab_none(self) -> None: self.gguf_writer.add_tokenizer_model("none") - def _set_vocab_gpt2(self, load_merges=True) -> None: + def _set_vocab_gpt2(self) -> None: tokens, toktypes, tokpre = self.get_vocab_base() self.gguf_writer.add_tokenizer_model("gpt2") self.gguf_writer.add_tokenizer_pre(tokpre) self.gguf_writer.add_token_list(tokens) self.gguf_writer.add_token_types(toktypes) - special_vocab = gguf.SpecialVocab(self.dir_model, load_merges) + special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True) special_vocab.add_to_gguf(self.gguf_writer) def _set_vocab_qwen(self): @@ -6405,11 +6405,6 @@ def __init__(self, *args, **kwargs): self._tok_embd = None def set_vocab(self): - """ - A self-contained vocab implementation for the HunYuan tiktoken-based tokenizer. - This method correctly generates tokens, types, and the required "fake" merges - to satisfy the llama.cpp GGUF loader. - """ from transformers import AutoTokenizer tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True) @@ -6456,7 +6451,7 @@ def set_vocab(self): # 5. Add special tokens and chat templates special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False) special_vocab.add_to_gguf(self.gguf_writer) - # FIX for BOS token: Manually set the correct BOS token ID. + # FIX for BOS token: Overwrite incorrect id read from config.json self.gguf_writer.add_bos_token_id(127959) # <|bos|> def set_gguf_parameters(self): @@ -6478,11 +6473,11 @@ def set_gguf_parameters(self): assert all(n == moe_shared_expert[0] for n in moe_shared_expert) self.gguf_writer.add_expert_shared_count(moe_shared_expert[0]) - self.gguf_writer.add_qk_norm(hparams.get("use_qk_norm", True)) - # Rope rope_scaling = hparams.get("rope_scaling", {}) if rope_scaling.get("type") == "dynamic": + # Not sure if YARN is correct here, and the factor in the config is only 1 anyway + # but the release claims to scale to 256k, which would be a factor of 8 self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN) self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"]) self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["max_position_embeddings"]) @@ -6492,31 +6487,41 @@ def set_gguf_parameters(self): def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: if name == "model.embed_tokens.weight": self._tok_embd = data_torch.clone() + if name == "lm_head.weight": if self.hparams.get("tie_word_embeddings", False): logger.info("Skipping tied output layer 'lm_head.weight'") return [] + if name.find("mlp.experts") != -1: n_experts = self.hparams["num_experts"] assert bid is not None + if self._experts is None: self._experts = [{} for _ in range(self.block_count)] + self._experts[bid][name] = data_torch + if len(self._experts[bid]) >= n_experts * 3: + # merge the experts into a single 3d tensor tensors: list[tuple[str, Tensor]] = [] for w_name in ["down_proj", "gate_proj", "up_proj"]: datas: list[Tensor] = [] + for xid in range(n_experts): ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight" datas.append(self._experts[bid][ename]) del self._experts[bid][ename] + data_torch = torch.stack(datas, dim=0) merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight" new_name = self.map_tensor_name(merged_name) tensors.append((new_name, data_torch)) + return tensors else: return [] + return [(self.map_tensor_name(name), data_torch)] def prepare_tensors(self): diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 31eb66cef2de4..e0a4f688ac7a8 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -148,7 +148,6 @@ class Attention: VALUE_LENGTH_MLA = "{arch}.attention.value_length_mla" SHARED_KV_LAYERS = "{arch}.attention.shared_kv_layers" SLIDING_WINDOW_PATTERN = "{arch}.attention.sliding_window_pattern" - QK_NORM = "{arch}.attention.qk_norm" class Rope: DIMENSION_COUNT = "{arch}.rope.dimension_count" diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index 8096baf244da3..d32cd479adb17 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -792,9 +792,6 @@ def add_group_norm_groups(self, value: int) -> None: def add_causal_attention(self, value: bool) -> None: self.add_bool(Keys.Attention.CAUSAL.format(arch=self.arch), value) - def add_qk_norm(self, value: bool) -> None: - self.add_bool(Keys.Attention.QK_NORM.format(arch=self.arch), value) - def add_q_lora_rank(self, length: int) -> None: self.add_uint32(Keys.Attention.Q_LORA_RANK.format(arch=self.arch), length) From 5471f5acf2ca4d8ca7500b4aa63616d6b9fb5368 Mon Sep 17 00:00:00 2001 From: kooshi <1934337+kooshi@users.noreply.github.com> Date: Sun, 29 Jun 2025 08:59:36 -0500 Subject: [PATCH 17/22] vocab_size sanity check --- convert_hf_to_gguf.py | 1 + 1 file changed, 1 insertion(+) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index d4be8f96248a4..88ebf4fa6d4f4 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -6425,6 +6425,7 @@ def set_vocab(self): # 3. Generate the tokens and toktypes lists vocab_size = self.hparams["vocab_size"] + assert tokenizer.vocab_size == vocab_size special_tokens = tokenizer.special_tokens reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in {**vocab, **special_tokens}.items()} tokens: list[str] = [] From 46c8b70cbc7346db95e45ebae4f1e0c68a9b8d86 Mon Sep 17 00:00:00 2001 From: kooshi <1934337+kooshi@users.noreply.github.com> Date: Sun, 29 Jun 2025 19:57:01 -0500 Subject: [PATCH 18/22] ntk alpha generic --- convert_hf_to_gguf.py | 21 ++++++++++++++++----- src/llama-model.cpp | 5 ----- 2 files changed, 16 insertions(+), 10 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 88ebf4fa6d4f4..26fcfef20fa1b 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -6477,11 +6477,22 @@ def set_gguf_parameters(self): # Rope rope_scaling = hparams.get("rope_scaling", {}) if rope_scaling.get("type") == "dynamic": - # Not sure if YARN is correct here, and the factor in the config is only 1 anyway - # but the release claims to scale to 256k, which would be a factor of 8 - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN) - self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"]) - self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["max_position_embeddings"]) + # HunYuan uses NTK Aware Alpha based scaling. Original implementation: https://www.reddit.com/r/LocalLLaMA/comments/14lz7j5/ntkaware_scaled_rope_allows_llama_models_to_have/ + # 1000 corresponds to a usable context length of 256k (https://github.com/Tencent-Hunyuan/Hunyuan-A13B/blob/main/report/Hunyuan_A13B_Technical_Report.pdf) + alpha = rope_scaling.get("alpha", 1000) + base = hparams.get("rope_theta", 10000.0) + dim = (hparams["hidden_size"] // hparams["num_attention_heads"]) # 128 + scaled_base = base * (alpha ** (dim / (dim-2))) # 10000 * (1000 ** (128 / 126)) = 11158839.9251 + self.gguf_writer.add_rope_freq_base(scaled_base) + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE) + self.gguf_writer.add_rope_scaling_factor(1) + #There is no consistent way to calculate ctx from alpha, and the config is incorrectly set to 32k + self.gguf_writer.add_rope_scaling_orig_ctx_len(256 * 1024) # 256k context length + self.gguf_writer.add_context_length(256 * 1024) # 256k context length + + # if any of our assumptions about the values are wrong, something has changed and this may need to be updated + assert alpha == 1000 and base == 10000.0 and dim == 128 and self.hparams["max_position_embeddings"] in [32 * 1024, 256 * 1024] , \ + "HunYuan dynamic RoPE scaling assumptions changed, please update the logic or context length manually" _experts: list[dict[str, Tensor]] | None = None diff --git a/src/llama-model.cpp b/src/llama-model.cpp index d881821d46ae1..4992f81c0e46d 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -1511,11 +1511,6 @@ void llama_model::load_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp); ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp); - // TODO: read from gguf - float n_dim = hparams.n_embd_head_k; - float alpha = 1000.0f; // NTK-Aware - hparams.rope_freq_base_train = 10000.0f * std::powf(alpha, n_dim / (n_dim - 2.0f)); - switch (hparams.n_layer) { case 32: type = LLM_TYPE_A13B; break; default: type = LLM_TYPE_UNKNOWN; From 251e78a4d430e5e9585c90cc4e2b159c27df2534 Mon Sep 17 00:00:00 2001 From: Xuan-Son Nguyen Date: Mon, 30 Jun 2025 17:44:15 +0200 Subject: [PATCH 19/22] Update convert_hf_to_gguf.py --- convert_hf_to_gguf.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 26fcfef20fa1b..14f8af995dc88 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -346,8 +346,6 @@ def prepare_tensors(self): data_qtype = gguf.GGMLQuantizationType.BF16 elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0: data_qtype = gguf.GGMLQuantizationType.Q8_0 - elif self.ftype == gguf.LlamaFileType.MOSTLY_Q4_0: - data_qtype = gguf.GGMLQuantizationType.Q4_0 elif self.ftype == gguf.LlamaFileType.MOSTLY_TQ1_0: data_qtype = gguf.GGMLQuantizationType.TQ1_0 elif self.ftype == gguf.LlamaFileType.MOSTLY_TQ2_0: From 06cab8f92de04653cb955af2dac79af06a869a8f Mon Sep 17 00:00:00 2001 From: Xuan-Son Nguyen Date: Mon, 30 Jun 2025 17:45:19 +0200 Subject: [PATCH 20/22] Apply suggestions from code review --- convert_hf_to_gguf.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 14f8af995dc88..c61cbb65d45f4 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -6630,7 +6630,7 @@ def parse_args() -> argparse.Namespace: help="path to write to; default: based on input. {ftype} will be replaced by the outtype.", ) parser.add_argument( - "--outtype", type=str, choices=["f32", "f16", "bf16", "q4_0", "q8_0", "tq1_0", "tq2_0", "auto"], default="f16", + "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0", "tq1_0", "tq2_0", "auto"], default="f16", help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, tq1_0 or tq2_0 for ternary, and auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type", ) parser.add_argument( @@ -6762,7 +6762,6 @@ def main() -> None: "f32": gguf.LlamaFileType.ALL_F32, "f16": gguf.LlamaFileType.MOSTLY_F16, "bf16": gguf.LlamaFileType.MOSTLY_BF16, - "q4_0": gguf.LlamaFileType.MOSTLY_Q4_0, "q8_0": gguf.LlamaFileType.MOSTLY_Q8_0, "tq1_0": gguf.LlamaFileType.MOSTLY_TQ1_0, "tq2_0": gguf.LlamaFileType.MOSTLY_TQ2_0, From 2d56a29636935956c394b23023dce00deb7cdbf6 Mon Sep 17 00:00:00 2001 From: Xuan-Son Nguyen Date: Mon, 30 Jun 2025 18:00:35 +0200 Subject: [PATCH 21/22] fix regression --- convert_hf_to_gguf.py | 1 + 1 file changed, 1 insertion(+) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 51dd92a10389f..051e37c402752 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -607,6 +607,7 @@ def get_vocab_base(self) -> tuple[list[str], list[int], str]: from transformers import AutoTokenizer tokenizer = AutoTokenizer.from_pretrained(self.dir_model) + vocab_size = self.hparams.get("vocab_size", len(tokenizer.vocab)) assert max(tokenizer.vocab.values()) < vocab_size tokpre = self.get_vocab_base_pre(tokenizer) From e5fe089210116acc64d3938884d52f0d0088822f Mon Sep 17 00:00:00 2001 From: Xuan-Son Nguyen Date: Wed, 2 Jul 2025 00:52:29 +0200 Subject: [PATCH 22/22] fix style --- convert_hf_to_gguf.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 051e37c402752..573f7aec459c1 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -6465,7 +6465,7 @@ def set_vocab(self): if len(token) == 1: continue merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank) - if len(merged) == 2: #todo this is an assert in Qwen, why? + if len(merged) == 2: # todo this is an assert in Qwen, why? merges.append(' '.join(map(QwenModel.token_bytes_to_string, merged))) # 3. Generate the tokens and toktypes lists @@ -6527,11 +6527,11 @@ def set_gguf_parameters(self): alpha = rope_scaling.get("alpha", 1000) base = hparams.get("rope_theta", 10000.0) dim = (hparams["hidden_size"] // hparams["num_attention_heads"]) # 128 - scaled_base = base * (alpha ** (dim / (dim-2))) # 10000 * (1000 ** (128 / 126)) = 11158839.9251 + scaled_base = base * (alpha ** (dim / (dim - 2))) # 10000 * (1000 ** (128 / 126)) = 11158839.9251 self.gguf_writer.add_rope_freq_base(scaled_base) self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE) self.gguf_writer.add_rope_scaling_factor(1) - #There is no consistent way to calculate ctx from alpha, and the config is incorrectly set to 32k + # There is no consistent way to calculate ctx from alpha, and the config is incorrectly set to 32k self.gguf_writer.add_rope_scaling_orig_ctx_len(256 * 1024) # 256k context length self.gguf_writer.add_context_length(256 * 1024) # 256k context length