
Commit c128138

support hunyuan_v1_dense 7b
Signed-off-by: stevenkuang <stevenkuang@tencent.com>
1 parent 55c509d commit c128138

8 files changed: +346 −0 lines changed

convert_hf_to_gguf.py

Lines changed: 102 additions & 0 deletions
@@ -819,6 +819,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "7e57df22b1fe23a7b1e1c7f3dc4e3f96d43a4eb0836d0c6bdc3436d7b2f1c664":
             # ref: https://huggingface.co/tencent/Hunyuan-A13B-Instruct
             res = "hunyuan"
+        if chkhsh == "bba3b3366b646dbdded5dbc42d59598b849371afc42f7beafa914afaa5b70aa6":
+            # ref: # TODO: update ref
+            res = "hunyuan-v1-dense"
         if chkhsh == "b0a6b1c0bd5998ebd9df08611efde34a4ff03faed45ae09c43e6b31ebd4b94cf":
             # ref: https://huggingface.co/skt/A.X-4.0
             res = "a.x-4.0"
@@ -7070,6 +7073,105 @@ def prepare_tensors(self):
                 raise ValueError(f"Unprocessed experts: {experts}")


+@ModelBase.register("HunYuanDenseV1ForCausalLM")
+class HunYuanModel(TextModel):
+    model_arch = gguf.MODEL_ARCH.HUNYUAN_V1_DENSE
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        # For handling tied embeddings
+        self._tok_embd = None
+
+    def set_vocab(self):
+        from transformers import AutoTokenizer
+        tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
+
+        # 1. Get the pre-tokenizer identifier hash
+        tokpre = self.get_vocab_base_pre(tokenizer)
+
+        # 2. Reverse-engineer the merges list from mergeable_ranks
+        merges = []
+        vocab = {}
+        mergeable_ranks = tokenizer.mergeable_ranks
+        for token, rank in mergeable_ranks.items():
+            vocab[QwenModel.token_bytes_to_string(token)] = rank
+            if len(token) == 1:
+                continue
+            merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank)
+            if len(merged) == 2:  # todo this is an assert in Qwen, why?
+                merges.append(' '.join(map(QwenModel.token_bytes_to_string, merged)))
+
+        # 3. Generate the tokens and toktypes lists
+        vocab_size = self.hparams["vocab_size"]
+        assert tokenizer.vocab_size == vocab_size
+        special_tokens = tokenizer.special_tokens
+        reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in {**vocab, **special_tokens}.items()}
+        tokens: list[str] = []
+        toktypes: list[int] = []
+        for i in range(vocab_size):
+            if i not in reverse_vocab:
+                tokens.append(f"[PAD{i}]")
+                toktypes.append(gguf.TokenType.UNUSED)
+            else:
+                token = reverse_vocab[i]
+                tokens.append(token)
+                if i in special_tokens.values():
+                    toktypes.append(gguf.TokenType.CONTROL)
+                else:
+                    toktypes.append(gguf.TokenType.NORMAL)
+
+        # 4. Write all vocab-related fields to the GGUF writer
+        self.gguf_writer.add_tokenizer_model("gpt2")
+        self.gguf_writer.add_tokenizer_pre(tokpre)
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_types(toktypes)
+        self.gguf_writer.add_token_merges(merges)
+
+        # 5. Add special tokens and chat templates
+        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False)
+        special_vocab.add_to_gguf(self.gguf_writer)
+        # FIX for BOS token: Overwrite incorrect id read from config.json
+        self.gguf_writer.add_bos_token_id(127959)  # <|bos|>
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        hparams = self.hparams
+
+        self.gguf_writer.add_expert_shared_feed_forward_length(hparams["intermediate_size"])
+
+        # Rope
+        rope_scaling = hparams.get("rope_scaling", {})
+        if rope_scaling.get("type") == "dynamic":
+            # HunYuan uses NTK Aware Alpha based scaling. Original implementation: https://www.reddit.com/r/LocalLLaMA/comments/14lz7j5/ntkaware_scaled_rope_allows_llama_models_to_have/
+            # 1000 corresponds to a usable context length of 256k (https://github.com/Tencent-Hunyuan/Hunyuan-A13B/blob/main/report/Hunyuan_A13B_Technical_Report.pdf)
+            alpha = rope_scaling.get("alpha", 50)
+            base = hparams.get("rope_theta", 10000.0)
+            dim = (hparams["hidden_size"] // hparams["num_attention_heads"])  # 128
+            scaled_base = base * (alpha ** (dim / (dim - 2)))  # 10000 * (50 ** (128 / 126)) = 532032.0339
+            self.gguf_writer.add_rope_freq_base(scaled_base)
+            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
+            self.gguf_writer.add_rope_scaling_factor(1)
+            # There is no consistent way to calculate ctx from alpha, and the config is incorrectly set to 32k
+            self.gguf_writer.add_rope_scaling_orig_ctx_len(256 * 1024)  # 256k context length
+            self.gguf_writer.add_context_length(256 * 1024)  # 256k context length
+
+            # if any of our assumptions about the values are wrong, something has changed and this may need to be updated
+            assert alpha == 50 and base == 10000.0 and dim in [96, 128] and self.hparams["max_position_embeddings"] in [32 * 1024, 256 * 1024], \
+                "HunYuan dynamic RoPE scaling assumptions changed, please update the logic or context length manually"
+
+    _experts: list[dict[str, Tensor]] | None = None
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        if name == "model.embed_tokens.weight":
+            self._tok_embd = data_torch.clone()
+
+        if name == "lm_head.weight":
+            if self.hparams.get("tie_word_embeddings", False):
+                logger.info("Skipping tied output layer 'lm_head.weight'")
+                return []
+
+        return [(self.map_tensor_name(name), data_torch)]
+
 @ModelBase.register("SmolLM3ForCausalLM")
 class SmolLM3Model(LlamaModel):
     model_arch = gguf.MODEL_ARCH.SMOLLM3
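As a quick sanity check on the RoPE handling in set_gguf_parameters() above, here is a minimal standalone sketch of the NTK-aware alpha scaling it bakes into the GGUF. The constants (alpha = 50, rope_theta = 10000.0, head dim = 128) are taken from the code comments above and assumed for the 7B dense config; nothing is read from an actual checkpoint.

```python
# Sketch of the NTK-aware alpha scaling applied during conversion (assumed
# constants: alpha = 50, rope_theta = 10000.0, head dim = 128).
alpha = 50
base = 10000.0
dim = 128  # hidden_size // num_attention_heads

scaled_base = base * (alpha ** (dim / (dim - 2)))
print(scaled_base)  # ~532032, matching the inline comment in the code above
```

Because this scaled value is written as rope_freq_base and the scaling type is set to NONE with factor 1, llama.cpp applies no further RoPE scaling at runtime for the converted model.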

convert_hf_to_gguf_update.py

Lines changed: 1 addition & 0 deletions
@@ -141,6 +141,7 @@ class TOKENIZER_TYPE(IntEnum):
     {"name": "glm4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/THUDM/glm-4-9b-hf", "chkhsh": "a1336059768a55c99a734006ffb02203cd450fed003e9a71886c88acf24fdbc2"},
     {"name": "minerva-7b", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0", "chkhsh": "1431a23e583c97432bc230bff598d103ddb5a1f89960c8f1d1051aaa944d0b35"},
     {"name": "hunyuan", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tencent/Hunyuan-A13B-Instruct", "chkhsh": "7e57df22b1fe23a7b1e1c7f3dc4e3f96d43a4eb0836d0c6bdc3436d7b2f1c664"},
+    {"name": "hunyuan-v1-dense", "tokt": TOKENIZER_TYPE.BPE, "repo": "", "chkhsh": ""},  # TODO: update hf model and hash
     # falcon-h1 series uses 4 different tokenizers across model sizes (0.5b - 34b), hence we need to define 4 different hashes
     {"name": "falcon-h1", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon-H1-0.5B-Base", "chkhsh": "a6b57017d60e6edb4d88ecc2845188e0eb333a70357e45dcc9b53964a73bbae6"},
     {"name": "falcon-h1", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon-H1-1B-Base", "chkhsh": "60476e1243776c4fb1b993dbd7a5f15ac22f83c80afdf425fa5ae01c8d44ef86"},

gguf-py/gguf/constants.py

Lines changed: 18 additions & 0 deletions
@@ -364,6 +364,7 @@ class MODEL_ARCH(IntEnum):
     ARCEE = auto()
     ERNIE4_5 = auto()
     HUNYUAN_MOE = auto()
+    HUNYUAN_V1_DENSE = auto()
     SMOLLM3 = auto()
     LFM2 = auto()

@@ -679,6 +680,7 @@ class MODEL_TENSOR(IntEnum):
     MODEL_ARCH.ERNIE4_5: "ernie4_5",
     MODEL_ARCH.FALCON_H1: "falcon-h1",
     MODEL_ARCH.HUNYUAN_MOE: "hunyuan-moe",
+    MODEL_ARCH.HUNYUAN_V1_DENSE: "hunyuan-v1-dense",
     MODEL_ARCH.SMOLLM3: "smollm3",
     MODEL_ARCH.LFM2: "lfm2",
 }
@@ -2351,6 +2353,22 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.FFN_DOWN_SHEXP,
         MODEL_TENSOR.FFN_UP_SHEXP,
     ],
+    MODEL_ARCH.HUNYUAN_V1_DENSE: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_Q_NORM,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_K_NORM,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+    ],
     MODEL_ARCH.SMOLLM3: [
         MODEL_TENSOR.TOKEN_EMBD,
         MODEL_TENSOR.OUTPUT_NORM,

src/llama-arch.cpp

Lines changed: 21 additions & 0 deletions
@@ -82,6 +82,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_ARCEE,            "arcee"            },
     { LLM_ARCH_ERNIE4_5,         "ernie4_5"         },
     { LLM_ARCH_HUNYUAN_MOE,      "hunyuan-moe"      },
+    { LLM_ARCH_HUNYUAN_V1_DENSE, "hunyuan-v1-dense" },
     { LLM_ARCH_SMOLLM3,          "smollm3"          },
     { LLM_ARCH_LFM2,             "lfm2"             },
     { LLM_ARCH_UNKNOWN,          "(unknown)"        },
@@ -1816,6 +1817,26 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_NAMES = {
             { LLM_TENSOR_FFN_UP_EXPS,   "blk.%d.ffn_up_exps" },
         },
     },
+    {
+        LLM_ARCH_HUNYUAN_V1_DENSE,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,    "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,   "output_norm" },
+            { LLM_TENSOR_OUTPUT,        "output" },
+            { LLM_TENSOR_ATTN_NORM,     "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q,        "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_Q_NORM,   "blk.%d.attn_q_norm" },
+            { LLM_TENSOR_ATTN_K,        "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_K_NORM,   "blk.%d.attn_k_norm" },
+            { LLM_TENSOR_ATTN_V,        "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,      "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM,      "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE,      "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN,      "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,        "blk.%d.ffn_up" },
+
+        },
+    },
     {
         LLM_ARCH_SMOLLM3,
         {

src/llama-arch.h

Lines changed: 1 addition & 0 deletions
@@ -86,6 +86,7 @@ enum llm_arch {
     LLM_ARCH_ARCEE,
     LLM_ARCH_ERNIE4_5,
     LLM_ARCH_HUNYUAN_MOE,
+    LLM_ARCH_HUNYUAN_V1_DENSE,
     LLM_ARCH_SMOLLM3,
     LLM_ARCH_LFM2,
     LLM_ARCH_UNKNOWN,

src/llama-chat.cpp

Lines changed: 13 additions & 0 deletions
@@ -65,6 +65,7 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
     { "llama4",            LLM_CHAT_TEMPLATE_LLAMA4 },
     { "smolvlm",           LLM_CHAT_TEMPLATE_SMOLVLM },
     { "hunyuan-moe",       LLM_CHAT_TEMPLATE_HUNYUAN_MOE },
+    { "hunyuan-v1-dense",  LLM_CHAT_TEMPLATE_HUNYUAN_V1_DENSE },
 };

 llm_chat_template llm_chat_template_from_str(const std::string & name) {
@@ -680,6 +681,18 @@ int32_t llm_chat_apply_template(
                 ss << "<|startoftext|>" << message->content << "<|extra_0|>";
             }
         }
+    } else if (tmpl == LLM_CHAT_TEMPLATE_HUNYUAN_V1_DENSE) {
+        //// Todo: add comment
+        for (auto message : chat) {
+            std::string role(message->role);
+            if (role == "system") {
+                ss << "<|startoftext|>" << message->content << "<|extra_4|>";
+            } else if (role == "assistant") {
+                ss << "<|startoftext|>" << message->content << "<|eos|>";
+            } else {
+                ss << "<|startoftext|>" << message->content << "<|extra_0|>";
+            }
+        }
     } else {
         // template not supported
         return -1;
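For reference, a small sketch that mirrors the new HUNYUAN_V1_DENSE branch above and shows the prompt string it builds for a made-up two-message conversation. The sample messages are illustrative only; the special tokens (<|startoftext|>, <|extra_4|>, <|eos|>, <|extra_0|>) come from the C++ code.

```python
# Sketch mirroring the LLM_CHAT_TEMPLATE_HUNYUAN_V1_DENSE branch in Python.
def render_hunyuan_v1_dense(chat):
    ss = ""
    for message in chat:
        role, content = message["role"], message["content"]
        if role == "system":
            ss += f"<|startoftext|>{content}<|extra_4|>"
        elif role == "assistant":
            ss += f"<|startoftext|>{content}<|eos|>"
        else:  # user and any other role
            ss += f"<|startoftext|>{content}<|extra_0|>"
    return ss

print(render_hunyuan_v1_dense([
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Hello"},
]))
# -> <|startoftext|>You are a helpful assistant.<|extra_4|><|startoftext|>Hello<|extra_0|>
```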

src/llama-chat.h

Lines changed: 1 addition & 0 deletions
@@ -45,6 +45,7 @@ enum llm_chat_template {
     LLM_CHAT_TEMPLATE_SMOLVLM,
     LLM_CHAT_TEMPLATE_DOTS1,
     LLM_CHAT_TEMPLATE_HUNYUAN_MOE,
+    LLM_CHAT_TEMPLATE_HUNYUAN_V1_DENSE,
     LLM_CHAT_TEMPLATE_UNKNOWN,
 };
