
Commit cbe9fc8

Merge branch 'upstream' into concedo_experimental
# Conflicts:
#	src/llama-vocab.cpp
2 parents ce7aa0d + c81f419


41 files changed: +1470 −27198 lines

convert_hf_to_gguf.py

Lines changed: 224 additions & 1 deletion
@@ -840,6 +840,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "169bf0296a13c4d9b7672313f749eb36501d931022de052aad6e36f2bf34dd51":
             # ref: https://huggingface.co/LiquidAI/LFM2-Tokenizer
             res = "lfm2"
+        if chkhsh == "81212dc7cdb7e0c1074ca62c5aeab0d43c9f52b8a737be7b12a777c953027890":
+            # ref: https://huggingface.co/moonshotai/Kimi-K2-Base
+            res = "kimi-k2"
 
         if res is None:
             logger.warning("\n")
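
For context on how these checksums come about: get_vocab_base_pre() encodes a fixed probe string with the model's tokenizer and hashes the resulting token-ID list, so a new pre-tokenizer such as kimi-k2 only needs its hash registered as above. A minimal sketch of that idea, with an illustrative probe string (the real script uses its own, much longer chktxt):

from hashlib import sha256
from transformers import AutoTokenizer

def vocab_pre_hash(model_dir: str, probe_text: str) -> str:
    # Hash the token-ID sequence produced for a fixed probe string;
    # tokenizers that split the probe identically yield the same hash.
    tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)
    ids = tokenizer.encode(probe_text)
    return sha256(str(ids).encode()).hexdigest()

# Hypothetical usage: a result equal to "81212dc7..." would select res = "kimi-k2"
# print(vocab_pre_hash("moonshotai/Kimi-K2-Base", "Hello world 123"))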
@@ -3508,6 +3511,175 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
         return [(new_name, data_torch)]
 
 
+@ModelBase.register("Plamo2ForCausalLM", "PLaMo2ForCausalLM")
+class Plamo2Model(TextModel):
+    model_arch = gguf.MODEL_ARCH.PLAMO2
+
+    def set_vocab(self):
+        # PLaMo 2 uses a custom tokenizer with a .jsonl file
+        # We need to handle this specially
+        tokenizer_jsonl_path = self.dir_model / "tokenizer.jsonl"
+        tokenizer_config_path = self.dir_model / "tokenizer_config.json"
+
+        if not tokenizer_jsonl_path.is_file():
+            raise FileNotFoundError(f"PLaMo 2 tokenizer file not found: {tokenizer_jsonl_path}")
+
+        # Load tokenizer config
+        with open(tokenizer_config_path, 'r', encoding='utf-8') as f:
+            tokenizer_config = json.load(f)
+
+        # Load tokens from JSONL file (actually a list format)
+        tokens = []
+        scores = []
+        toktypes = []
+
+        with open(tokenizer_jsonl_path, 'r', encoding='utf-8') as f:
+            for line_num, line in enumerate(f):
+                if line.strip():
+                    token_data = json.loads(line)
+                    # Format: [token, score, type, ?, ?, ?, ?]
+                    token = token_data[0].encode("utf-8")
+                    score = float(token_data[1])
+                    token_type_str = token_data[2] if len(token_data) > 2 else "NORMAL"
+
+                    tokens.append(token)
+                    scores.append(score)
+
+                    # Map token type strings to GGUF token types
+                    if token_type_str == "UNKNOWN":
+                        toktypes.append(gguf.TokenType.UNKNOWN)
+                    elif token_type_str == "CONTROL":
+                        toktypes.append(gguf.TokenType.CONTROL)
+                    elif token_type_str == "BYTE":
+                        toktypes.append(gguf.TokenType.BYTE)
+                    else:
+                        # Check for PLaMo-2 special tokens
+                        token_str = token_data[0]
+                        if token_str.startswith("<|plamo:") and token_str.endswith("|>"):
+                            toktypes.append(gguf.TokenType.CONTROL)
+                        else:
+                            toktypes.append(gguf.TokenType.NORMAL)
+
+        vocab_size = self.hparams["vocab_size"]
+        if vocab_size > len(tokens):
+            pad_count = vocab_size - len(tokens)
+            logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]")
+            for i in range(1, pad_count + 1):
+                tokens.append(bytes(f"[PAD{i}]", encoding="utf-8"))
+                scores.append(-1000.0)
+                toktypes.append(gguf.TokenType.UNUSED)
+
+        # Use "plamo2" tokenizer type for PLaMo-2's custom Aho-Corasick tokenizer
+        self.gguf_writer.add_tokenizer_model("plamo2")
+        self.gguf_writer.add_tokenizer_pre("default")
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_scores(scores)
+        self.gguf_writer.add_token_types(toktypes)
+
+        # Add special tokens from config
+        if "bos_token" in tokenizer_config and tokenizer_config["bos_token"] is not None:
+            token_id = tokens.index(tokenizer_config["bos_token"].encode("utf-8"))
+            self.gguf_writer.add_bos_token_id(token_id)
+        if "eos_token" in tokenizer_config and tokenizer_config["eos_token"] is not None:
+            token_id = tokens.index(tokenizer_config["eos_token"].encode("utf-8"))
+            self.gguf_writer.add_eos_token_id(token_id)
+        if "pad_token" in tokenizer_config and tokenizer_config["pad_token"] is not None:
+            token_id = tokens.index(tokenizer_config["pad_token"].encode("utf-8"))
+            self.gguf_writer.add_pad_token_id(token_id)
+        if "sep_token" in tokenizer_config and tokenizer_config["sep_token"] is not None:
+            token_id = tokens.index(tokenizer_config["sep_token"].encode("utf-8"))
+            self.gguf_writer.add_sep_token_id(token_id)
+        if "unk_token" in tokenizer_config and tokenizer_config["unk_token"] is not None:
+            token_id = tokens.index(tokenizer_config["unk_token"].encode("utf-8"))
+            self.gguf_writer.add_unk_token_id(token_id)
+
+        # Add <|plamo:op|> as EOT to ensure appropriate end of generation
+        self.gguf_writer.add_eot_token_id(4)
+
+        self.gguf_writer.add_add_space_prefix(False)
+
+    def set_gguf_parameters(self):
+        hparams = self.hparams
+        block_count = hparams["num_hidden_layers"]
+        self.gguf_writer.add_vocab_size(self.hparams["vocab_size"])
+
+        # Which layers are Mamba layers
+        # PLaMo 2 uses mamba_step to indicate the pattern (e.g., 2 means every other layer)
+        # This logic matches modeling_plamo.py's is_mamba function
+        mamba_step = hparams.get("mamba_step", 2)
+        mamba_enabled = hparams.get("mamba_enabled", True)
+        mamba_layers = []
+
+        if mamba_enabled:
+            for i in range(block_count):
+                if block_count <= (mamba_step // 2):
+                    # use attention in last layer
+                    is_mamba = (i != block_count - 1)
+                else:
+                    is_mamba = (i % mamba_step) != (mamba_step // 2)
+                if is_mamba:
+                    mamba_layers.append(0)
+                else:
+                    mamba_layers.append(hparams.get("num_key_value_heads", 4))
+
+        if mamba_layers:
+            self.gguf_writer.add_head_count_kv(mamba_layers)
+
+        self.gguf_writer.add_context_length(hparams.get("max_position_embeddings", 2048))
+        self.gguf_writer.add_embedding_length(hparams.get("hidden_size", 4096))
+        self.gguf_writer.add_block_count(block_count)
+        self.gguf_writer.add_head_count(hparams.get("num_attention_heads", 32))
+        self.gguf_writer.add_layer_norm_rms_eps(hparams.get("rms_norm_eps", 1e-06))
+        self.gguf_writer.add_rope_freq_base(hparams.get("rope_theta", 1000000.0))
+
+        # Mamba parameters
+        self.gguf_writer.add_ssm_state_size(hparams.get("mamba_d_state", 64))
+        self.gguf_writer.add_ssm_conv_kernel(hparams.get("mamba_d_conv", 4))
+        self.gguf_writer.add_ssm_time_step_rank(hparams.get("mamba_num_heads", 64))
+        intermediate_size = hparams.get("mamba_num_heads", 64) * hparams.get("hidden_size_per_head", 128)
+        self.gguf_writer.add_ssm_inner_size(intermediate_size)
+        self.gguf_writer.add_ssm_group_count(0)
+
+        # MLP feed forward parameters (for attention layers)
+        self.gguf_writer.add_feed_forward_length(hparams.get("intermediate_size", 16384))
+        self.gguf_writer.add_file_type(self.ftype)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid # unused
+
+        if name.endswith(".A_log"):
+            data_torch = -torch.exp(data_torch)
+        elif name.endswith(".dt_bias"):
+            name = name.rpartition(".dt_bias")[0] + ".dt_proj.bias"
+        elif name.endswith(".dt_norm_weight"):
+            name = name.rpartition(".dt_norm_weight")[0] + ".dt_norm.weight"
+        elif name.endswith(".B_norm_weight"):
+            name = name.rpartition(".B_norm_weight")[0] + ".B_norm.weight"
+        elif name.endswith(".C_norm_weight"):
+            name = name.rpartition(".C_norm_weight")[0] + ".C_norm.weight"
+        elif name.endswith(".k_weight"):
+            name = name.rpartition(".k_weight")[0] + ".k.weight"
+        elif name.endswith(".q_weight"):
+            name = name.rpartition(".q_weight")[0] + ".q.weight"
+        elif name.endswith(".conv1d.weight"):
+            data_torch = torch.squeeze(data_torch) # remove (, 1, )
+            assert data_torch.ndim == 2
+        elif name.endswith(".pre_mixer_norm.weight"):
+            data_torch += 1.0
+        elif name.endswith(".post_mixer_norm.weight"):
+            data_torch += 1.0 / 5
+        elif name.endswith(".pre_mlp_norm.weight"):
+            data_torch += 1.0
+        elif name.endswith(".post_mlp_norm.weight"):
+            data_torch += 1.0 / (5**1.5)
+        elif name.endswith(".norm.weight"):
+            data_torch += 1.0
+
+        new_name = self.map_tensor_name(name)
+
+        return [(new_name, data_torch)]
+
+
 @ModelBase.register("CodeShellForCausalLM")
 class CodeShellModel(TextModel):
     model_arch = gguf.MODEL_ARCH.CODESHELL
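
The subtle part of Plamo2Model.set_gguf_parameters() is the layer schedule: with mamba_step = 2, attention sits at the offsets where i % mamba_step == mamba_step // 2 and every other layer is a Mamba block, and the per-layer head_count_kv list encodes that schedule with 0 for Mamba layers and the KV-head count for attention layers. A self-contained sketch of the same selection logic, using an assumed 8-layer config for illustration:

def plamo2_kv_head_pattern(block_count: int, mamba_step: int = 2, num_kv_heads: int = 4) -> list[int]:
    # 0 marks a Mamba layer, num_kv_heads marks an attention layer,
    # mirroring the is_mamba logic in the converter above.
    pattern = []
    for i in range(block_count):
        if block_count <= (mamba_step // 2):
            is_mamba = i != block_count - 1  # very small models: attention only in the last layer
        else:
            is_mamba = (i % mamba_step) != (mamba_step // 2)
        pattern.append(0 if is_mamba else num_kv_heads)
    return pattern

print(plamo2_kv_head_pattern(8))  # -> [0, 4, 0, 4, 0, 4, 0, 4]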
@@ -5570,7 +5742,58 @@ class DeepseekV2Model(TextModel):
     model_arch = gguf.MODEL_ARCH.DEEPSEEK2
 
     def set_vocab(self):
-        self._set_vocab_gpt2()
+        try:
+            self._set_vocab_gpt2()
+            return
+        except Exception:
+            pass
+
+        from transformers import AutoTokenizer
+        tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
+        tokpre = self.get_vocab_base_pre(tokenizer)
+
+        if tokpre == "kimi-k2":
+            # Build merges list using the approach similar to HunYuanMoE
+            merges = []
+            vocab = {}
+            mergeable_ranks = tokenizer.model._mergeable_ranks
+            for token, rank in mergeable_ranks.items():
+                vocab[QwenModel.token_bytes_to_string(token)] = rank
+                if len(token) == 1:
+                    continue
+                merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank)
+                if len(merged) == 2:
+                    merges.append(' '.join(map(QwenModel.token_bytes_to_string, merged)))
+
+            # Build token list
+            vocab_size = self.hparams["vocab_size"]
+            special_tokens = tokenizer.special_tokens
+            reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in {**vocab, **special_tokens}.items()}
+            tokens: list[str] = []
+            toktypes: list[int] = []
+
+            for i in range(vocab_size):
+                if i not in reverse_vocab:
+                    tokens.append(f"[PAD{i}]")
+                    toktypes.append(gguf.TokenType.UNUSED)
+                else:
+                    token = reverse_vocab[i]
+                    tokens.append(token)
+                    if i in special_tokens.values():
+                        toktypes.append(gguf.TokenType.CONTROL)
+                    else:
+                        toktypes.append(gguf.TokenType.NORMAL)
+
+            self.gguf_writer.add_tokenizer_model("gpt2")
+            self.gguf_writer.add_tokenizer_pre(tokpre)
+            self.gguf_writer.add_token_list(tokens)
+            self.gguf_writer.add_token_types(toktypes)
+            self.gguf_writer.add_token_merges(merges)
+
+            special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False)
+            special_vocab.add_to_gguf(self.gguf_writer)
+        else:
+            raise NotImplementedError(f"Deepseek pre-tokenizer {tokpre!r} is not supported yet!")
 
     def set_gguf_parameters(self):
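
The kimi-k2 branch recovers BPE merges from a tiktoken-style rank table the same way the HunYuanMoE converter does: for every multi-byte token, QwenModel.bpe re-splits its bytes while only allowing merges of strictly lower rank, and if exactly two parts remain, that pair is the merge that created the token. A simplified, self-contained sketch of that reconstruction (toy rank table and helper name are illustrative, not the script's exact code):

def bpe_parts(mergeable_ranks: dict[bytes, int], token: bytes, max_rank: int) -> list[bytes]:
    # Repeatedly apply the lowest-ranked available pair merge, but only
    # merges ranked strictly below max_rank (i.e. learned before `token`).
    parts = [bytes([b]) for b in token]
    while True:
        best_idx, best_rank = None, None
        for i in range(len(parts) - 1):
            rank = mergeable_ranks.get(parts[i] + parts[i + 1])
            if rank is not None and rank < max_rank and (best_rank is None or rank < best_rank):
                best_idx, best_rank = i, rank
        if best_idx is None:
            return parts
        parts[best_idx:best_idx + 2] = [parts[best_idx] + parts[best_idx + 1]]

# Toy vocabulary: "ab" was learned before "abc", so "abc" decomposes into ("ab", "c")
ranks = {b"a": 0, b"b": 1, b"c": 2, b"ab": 3, b"abc": 4}
print(bpe_parts(ranks, b"abc", max_rank=ranks[b"abc"]))  # -> [b'ab', b'c'], recorded as the merge "ab c"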

convert_hf_to_gguf_update.py

Lines changed: 1 addition & 0 deletions
@@ -146,6 +146,7 @@ class TOKENIZER_TYPE(IntEnum):
     {"name": "falcon-h1", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon-H1-1B-Base", "chkhsh": "60476e1243776c4fb1b993dbd7a5f15ac22f83c80afdf425fa5ae01c8d44ef86"},
     {"name": "falcon-h1", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon-H1-7B-Base", "chkhsh": "3eda48b4c4dc7de733d1a8b3e3b4a85243dbbf704da2ee9d42c6beced8897896"},
     {"name": "falcon-h1", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon-H1-34B-Base", "chkhsh": "48f8e02c0359c0bbdd82f26909171fac1c18a457bb47573ed1fe3bbb2c1cfd4b"},
+    {"name": "kimi-k2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/moonshotai/Kimi-K2-Base", "chkhsh": "81212dc7cdb7e0c1074ca62c5aeab0d43c9f52b8a737be7b12a777c953027890"},
 ]
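
Each row of this models list drives the update script: it points at the tokenizer repo to download and records the expected checksum, from which the chkhsh -> res branches in convert_hf_to_gguf.py are regenerated. A rough sketch of how a registry row like the kimi-k2 entry maps onto the generated branch shown in the first hunk above (the emit helper is an assumption for illustration, not the script's actual code):

models = [
    {"name": "kimi-k2", "tokt": "BPE",
     "repo": "https://huggingface.co/moonshotai/Kimi-K2-Base",
     "chkhsh": "81212dc7cdb7e0c1074ca62c5aeab0d43c9f52b8a737be7b12a777c953027890"},
]

def emit_pre_tokenizer_branches(models: list[dict]) -> str:
    # Produce one `if chkhsh == ...` branch per registered tokenizer,
    # matching the shape of the hunk added to get_vocab_base_pre() above.
    lines = []
    for m in models:
        lines.append(f'        if chkhsh == "{m["chkhsh"]}":')
        lines.append(f'            # ref: {m["repo"]}')
        lines.append(f'            res = "{m["name"]}"')
    return "\n".join(lines)

print(emit_pre_tokenizer_branches(models))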