@@ -840,6 +840,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "169bf0296a13c4d9b7672313f749eb36501d931022de052aad6e36f2bf34dd51":
             # ref: https://huggingface.co/LiquidAI/LFM2-Tokenizer
             res = "lfm2"
+        if chkhsh == "81212dc7cdb7e0c1074ca62c5aeab0d43c9f52b8a737be7b12a777c953027890":
+            # ref: https://huggingface.co/moonshotai/Kimi-K2-Base
+            res = "kimi-k2"

         if res is None:
             logger.warning("\n")
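
For context, each chkhsh compared above is, as far as I can tell from get_vocab_base_pre(), a SHA-256 fingerprint of the token IDs the tokenizer emits for a fixed probe string. A minimal sketch of that idea follows; pre_tokenizer_fingerprint and probe_text are illustrative names, not code from this commit, and the real probe text is the chktxt constant defined in the converter.

    # Hedged sketch: derive a pre-tokenizer fingerprint the way the converter
    # appears to (hash of the encoded probe text), assuming this is the scheme.
    from hashlib import sha256
    from transformers import AutoTokenizer

    def pre_tokenizer_fingerprint(model_dir: str, probe_text: str) -> str:
        tok = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)
        return sha256(str(tok.encode(probe_text)).encode()).hexdigest()

    # With the converter's own probe text, "moonshotai/Kimi-K2-Base" is expected
    # to produce the "81212dc7..." value matched above.
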
@@ -3508,6 +3511,175 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
         return [(new_name, data_torch)]


+@ModelBase.register("Plamo2ForCausalLM", "PLaMo2ForCausalLM")
+class Plamo2Model(TextModel):
+    model_arch = gguf.MODEL_ARCH.PLAMO2
+
+    def set_vocab(self):
+        # PLaMo 2 uses a custom tokenizer with a .jsonl file
+        # We need to handle this specially
+        tokenizer_jsonl_path = self.dir_model / "tokenizer.jsonl"
+        tokenizer_config_path = self.dir_model / "tokenizer_config.json"
+
+        if not tokenizer_jsonl_path.is_file():
+            raise FileNotFoundError(f"PLaMo 2 tokenizer file not found: {tokenizer_jsonl_path}")
+
+        # Load tokenizer config
+        with open(tokenizer_config_path, 'r', encoding='utf-8') as f:
+            tokenizer_config = json.load(f)
+
+        # Load tokens from JSONL file (actually a list format)
+        tokens = []
+        scores = []
+        toktypes = []
+
+        with open(tokenizer_jsonl_path, 'r', encoding='utf-8') as f:
+            for line_num, line in enumerate(f):
+                if line.strip():
+                    token_data = json.loads(line)
+                    # Format: [token, score, type, ?, ?, ?, ?]
+                    token = token_data[0].encode("utf-8")
+                    score = float(token_data[1])
+                    token_type_str = token_data[2] if len(token_data) > 2 else "NORMAL"
+
+                    tokens.append(token)
+                    scores.append(score)
+
+                    # Map token type strings to GGUF token types
+                    if token_type_str == "UNKNOWN":
+                        toktypes.append(gguf.TokenType.UNKNOWN)
+                    elif token_type_str == "CONTROL":
+                        toktypes.append(gguf.TokenType.CONTROL)
+                    elif token_type_str == "BYTE":
+                        toktypes.append(gguf.TokenType.BYTE)
+                    else:
+                        # Check for PLaMo-2 special tokens
+                        token_str = token_data[0]
+                        if token_str.startswith("<|plamo:") and token_str.endswith("|>"):
+                            toktypes.append(gguf.TokenType.CONTROL)
+                        else:
+                            toktypes.append(gguf.TokenType.NORMAL)
+
+        vocab_size = self.hparams["vocab_size"]
+        if vocab_size > len(tokens):
+            pad_count = vocab_size - len(tokens)
+            logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]")
+            for i in range(1, pad_count + 1):
+                tokens.append(bytes(f"[PAD{i}]", encoding="utf-8"))
+                scores.append(-1000.0)
+                toktypes.append(gguf.TokenType.UNUSED)
+
+        # Use "plamo2" tokenizer type for PLaMo-2's custom Aho-Corasick tokenizer
+        self.gguf_writer.add_tokenizer_model("plamo2")
+        self.gguf_writer.add_tokenizer_pre("default")
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_scores(scores)
+        self.gguf_writer.add_token_types(toktypes)
+
+        # Add special tokens from config
+        if "bos_token" in tokenizer_config and tokenizer_config["bos_token"] is not None:
+            token_id = tokens.index(tokenizer_config["bos_token"].encode("utf-8"))
+            self.gguf_writer.add_bos_token_id(token_id)
+        if "eos_token" in tokenizer_config and tokenizer_config["eos_token"] is not None:
+            token_id = tokens.index(tokenizer_config["eos_token"].encode("utf-8"))
+            self.gguf_writer.add_eos_token_id(token_id)
+        if "pad_token" in tokenizer_config and tokenizer_config["pad_token"] is not None:
+            token_id = tokens.index(tokenizer_config["pad_token"].encode("utf-8"))
+            self.gguf_writer.add_pad_token_id(token_id)
+        if "sep_token" in tokenizer_config and tokenizer_config["sep_token"] is not None:
+            token_id = tokens.index(tokenizer_config["sep_token"].encode("utf-8"))
+            self.gguf_writer.add_sep_token_id(token_id)
+        if "unk_token" in tokenizer_config and tokenizer_config["unk_token"] is not None:
+            token_id = tokens.index(tokenizer_config["unk_token"].encode("utf-8"))
+            self.gguf_writer.add_unk_token_id(token_id)
+
+        # Add <|plamo:op|> as EOT to ensure appropriate end of generation
+        self.gguf_writer.add_eot_token_id(4)
+
+        self.gguf_writer.add_add_space_prefix(False)
+
+    def set_gguf_parameters(self):
+        hparams = self.hparams
+        block_count = hparams["num_hidden_layers"]
+        self.gguf_writer.add_vocab_size(self.hparams["vocab_size"])
+
+        # Which layers are Mamba layers
+        # PLaMo 2 uses mamba_step to indicate the pattern (e.g., 2 means every other layer)
+        # This logic matches modeling_plamo.py's is_mamba function
+        mamba_step = hparams.get("mamba_step", 2)
+        mamba_enabled = hparams.get("mamba_enabled", True)
+        mamba_layers = []
+
+        if mamba_enabled:
+            for i in range(block_count):
+                if block_count <= (mamba_step // 2):
+                    # use attention in last layer
+                    is_mamba = (i != block_count - 1)
+                else:
+                    is_mamba = (i % mamba_step) != (mamba_step // 2)
+                if is_mamba:
+                    mamba_layers.append(0)
+                else:
+                    mamba_layers.append(hparams.get("num_key_value_heads", 4))
+
+        if mamba_layers:
+            self.gguf_writer.add_head_count_kv(mamba_layers)
+
+        self.gguf_writer.add_context_length(hparams.get("max_position_embeddings", 2048))
+        self.gguf_writer.add_embedding_length(hparams.get("hidden_size", 4096))
+        self.gguf_writer.add_block_count(block_count)
+        self.gguf_writer.add_head_count(hparams.get("num_attention_heads", 32))
+        self.gguf_writer.add_layer_norm_rms_eps(hparams.get("rms_norm_eps", 1e-06))
+        self.gguf_writer.add_rope_freq_base(hparams.get("rope_theta", 1000000.0))
+
+        # Mamba parameters
+        self.gguf_writer.add_ssm_state_size(hparams.get("mamba_d_state", 64))
+        self.gguf_writer.add_ssm_conv_kernel(hparams.get("mamba_d_conv", 4))
+        self.gguf_writer.add_ssm_time_step_rank(hparams.get("mamba_num_heads", 64))
+        intermediate_size = hparams.get("mamba_num_heads", 64) * hparams.get("hidden_size_per_head", 128)
+        self.gguf_writer.add_ssm_inner_size(intermediate_size)
+        self.gguf_writer.add_ssm_group_count(0)
+
+        # MLP feed forward parameters (for attention layers)
+        self.gguf_writer.add_feed_forward_length(hparams.get("intermediate_size", 16384))
+        self.gguf_writer.add_file_type(self.ftype)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid  # unused
+
+        if name.endswith(".A_log"):
+            data_torch = -torch.exp(data_torch)
+        elif name.endswith(".dt_bias"):
+            name = name.rpartition(".dt_bias")[0] + ".dt_proj.bias"
+        elif name.endswith(".dt_norm_weight"):
+            name = name.rpartition(".dt_norm_weight")[0] + ".dt_norm.weight"
+        elif name.endswith(".B_norm_weight"):
+            name = name.rpartition(".B_norm_weight")[0] + ".B_norm.weight"
+        elif name.endswith(".C_norm_weight"):
+            name = name.rpartition(".C_norm_weight")[0] + ".C_norm.weight"
+        elif name.endswith(".k_weight"):
+            name = name.rpartition(".k_weight")[0] + ".k.weight"
+        elif name.endswith(".q_weight"):
+            name = name.rpartition(".q_weight")[0] + ".q.weight"
+        elif name.endswith(".conv1d.weight"):
+            data_torch = torch.squeeze(data_torch)  # remove (, 1, )
+            assert data_torch.ndim == 2
+        elif name.endswith(".pre_mixer_norm.weight"):
+            data_torch += 1.0
+        elif name.endswith(".post_mixer_norm.weight"):
+            data_torch += 1.0 / 5
+        elif name.endswith(".pre_mlp_norm.weight"):
+            data_torch += 1.0
+        elif name.endswith(".post_mlp_norm.weight"):
+            data_torch += 1.0 / (5 ** 1.5)
+        elif name.endswith(".norm.weight"):
+            data_torch += 1.0
+
+        new_name = self.map_tensor_name(name)
+
+        return [(new_name, data_torch)]
+
+
@ModelBase .register ("CodeShellForCausalLM" )
3512
3684
class CodeShellModel (TextModel ):
3513
3685
model_arch = gguf .MODEL_ARCH .CODESHELL
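
A note on the head_count_kv list built in Plamo2Model.set_gguf_parameters above: it doubles as the Mamba/attention layer map, with 0 marking a Mamba layer. The standalone sketch below reproduces that interleaving under the defaults used in the hunk (mamba_step=2, num_key_value_heads=4); layer_pattern is a hypothetical helper, not part of the commit.

    # Hedged sketch of the layer interleaving encoded into head_count_kv.
    def layer_pattern(block_count: int, mamba_step: int = 2, num_kv_heads: int = 4) -> list[int]:
        kv_heads = []
        for i in range(block_count):
            if block_count <= (mamba_step // 2):
                is_mamba = i != block_count - 1  # keep attention in the last layer
            else:
                is_mamba = (i % mamba_step) != (mamba_step // 2)
            kv_heads.append(0 if is_mamba else num_kv_heads)
        return kv_heads

    print(layer_pattern(8))  # [0, 4, 0, 4, 0, 4, 0, 4]: even layers Mamba, odd layers attention
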
@@ -5570,7 +5742,58 @@ class DeepseekV2Model(TextModel):
     model_arch = gguf.MODEL_ARCH.DEEPSEEK2

     def set_vocab(self):
-        self._set_vocab_gpt2()
+        try:
+            self._set_vocab_gpt2()
+            return
+        except Exception:
+            pass
+
+        from transformers import AutoTokenizer
+        tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
+        tokpre = self.get_vocab_base_pre(tokenizer)
+
+        if tokpre == "kimi-k2":
+            # Build the merges list with an approach similar to HunYuanMoE
+            merges = []
+            vocab = {}
+            mergeable_ranks = tokenizer.model._mergeable_ranks
+            for token, rank in mergeable_ranks.items():
+                vocab[QwenModel.token_bytes_to_string(token)] = rank
+                if len(token) == 1:
+                    continue
+                merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank)
+                if len(merged) == 2:
+                    merges.append(' '.join(map(QwenModel.token_bytes_to_string, merged)))
+
+            # Build token list
+            vocab_size = self.hparams["vocab_size"]
+            special_tokens = tokenizer.special_tokens
+            reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in {**vocab, **special_tokens}.items()}
+            tokens: list[str] = []
+            toktypes: list[int] = []
+
+            for i in range(vocab_size):
+                if i not in reverse_vocab:
+                    tokens.append(f"[PAD{i}]")
+                    toktypes.append(gguf.TokenType.UNUSED)
+                else:
+                    token = reverse_vocab[i]
+                    tokens.append(token)
+                    if i in special_tokens.values():
+                        toktypes.append(gguf.TokenType.CONTROL)
+                    else:
+                        toktypes.append(gguf.TokenType.NORMAL)
+
+            self.gguf_writer.add_tokenizer_model("gpt2")
+            self.gguf_writer.add_tokenizer_pre(tokpre)
+            self.gguf_writer.add_token_list(tokens)
+            self.gguf_writer.add_token_types(toktypes)
+            self.gguf_writer.add_token_merges(merges)
+
+            special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False)
+            special_vocab.add_to_gguf(self.gguf_writer)
+        else:
+            raise NotImplementedError(f"Deepseek pre-tokenizer {tokpre!r} is not supported yet!")


     def set_gguf_parameters(self):
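
Closing note on the kimi-k2 branch above: the merges list is reconstructed from the tiktoken-style _mergeable_ranks table rather than read from a tokenizer.json. The sketch below shows the idea behind QwenModel.bpe in simplified form (it is not the upstream implementation, and the rank table is a toy, not the real Kimi-K2 vocabulary): re-split a multi-byte token while ignoring merges ranked at or above the token's own rank; if exactly two parts remain, they form that token's merge rule.

    # Hedged, simplified sketch of recovering BPE merges from tiktoken-style ranks.
    def bpe_split(mergeable_ranks: dict[bytes, int], token: bytes, max_rank: int) -> list[bytes]:
        parts = [bytes([b]) for b in token]
        while True:
            best = None  # (index, rank) of the lowest-ranked adjacent pair below max_rank
            for i in range(len(parts) - 1):
                rank = mergeable_ranks.get(parts[i] + parts[i + 1])
                if rank is not None and rank < max_rank and (best is None or rank < best[1]):
                    best = (i, rank)
            if best is None:
                break
            i = best[0]
            parts = parts[:i] + [parts[i] + parts[i + 1]] + parts[i + 2:]
        return parts

    # Toy ranks: "ab" was merged before "abc", so "abc" splits back into "ab" + "c".
    ranks = {b"a": 0, b"b": 1, b"c": 2, b"ab": 3, b"abc": 4}
    print(bpe_split(ranks, b"abc", max_rank=4))  # [b'ab', b'c'] -> merge rule "ab c"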