@@ -815,6 +815,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "1431a23e583c97432bc230bff598d103ddb5a1f89960c8f1d1051aaa944d0b35":
             # ref: https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0
             res = "minerva-7b"
+        if chkhsh == "7e57df22b1fe23a7b1e1c7f3dc4e3f96d43a4eb0836d0c6bdc3436d7b2f1c664":
+            # ref: https://huggingface.co/tencent/Hunyuan-A13B-Instruct
+            res = "hunyuan"
 
         if res is None:
             logger.warning("\n")
@@ -6652,6 +6655,160 @@ def set_gguf_parameters(self):
         super().set_gguf_parameters()
         self.gguf_writer.add_audio_stack_factor(self.global_config["stack_factor"])
 
+
+@ModelBase.register("HunYuanMoEV1ForCausalLM")
+class HunYuanMoEModel(TextModel):
+    model_arch = gguf.MODEL_ARCH.HUNYUAN_MOE
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        # For handling tied embeddings
+        self._tok_embd = None
+
+    def set_vocab(self):
+        from transformers import AutoTokenizer
+        tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
+
+        # 1. Get the pre-tokenizer identifier hash
+        tokpre = self.get_vocab_base_pre(tokenizer)
+
+        # 2. Reverse-engineer the merges list from mergeable_ranks
+        merges = []
+        vocab = {}
+        mergeable_ranks = tokenizer.mergeable_ranks
+        for token, rank in mergeable_ranks.items():
+            vocab[QwenModel.token_bytes_to_string(token)] = rank
+            if len(token) == 1:
+                continue
+            merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank)
+            if len(merged) == 2:  # todo this is an assert in Qwen, why?
+                merges.append(' '.join(map(QwenModel.token_bytes_to_string, merged)))
+
+        # 3. Generate the tokens and toktypes lists
+        vocab_size = self.hparams["vocab_size"]
+        assert tokenizer.vocab_size == vocab_size
+        special_tokens = tokenizer.special_tokens
+        reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in {**vocab, **special_tokens}.items()}
+        tokens: list[str] = []
+        toktypes: list[int] = []
+        for i in range(vocab_size):
+            if i not in reverse_vocab:
+                tokens.append(f"[PAD{i}]")
+                toktypes.append(gguf.TokenType.UNUSED)
+            else:
+                token = reverse_vocab[i]
+                tokens.append(token)
+                if i in special_tokens.values():
+                    toktypes.append(gguf.TokenType.CONTROL)
+                else:
+                    toktypes.append(gguf.TokenType.NORMAL)
+
+        # 4. Write all vocab-related fields to the GGUF writer
+        self.gguf_writer.add_tokenizer_model("gpt2")
+        self.gguf_writer.add_tokenizer_pre(tokpre)
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_types(toktypes)
+        self.gguf_writer.add_token_merges(merges)
+
+        # 5. Add special tokens and chat templates
+        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False)
+        special_vocab.add_to_gguf(self.gguf_writer)
+        # FIX for BOS token: Overwrite incorrect id read from config.json
+        self.gguf_writer.add_bos_token_id(127959)  # <|bos|>
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        hparams = self.hparams
+
+        self.gguf_writer.add_expert_count(hparams["num_experts"])
+        self.gguf_writer.add_expert_shared_feed_forward_length(hparams["intermediate_size"])
+
+        moe_intermediate_size = hparams["moe_intermediate_size"]
+        assert all(n == moe_intermediate_size[0] for n in moe_intermediate_size)
+        self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size[0])
+
+        moe_topk = hparams["moe_topk"]
+        assert all(topk == moe_topk[0] for topk in moe_topk)
+        self.gguf_writer.add_expert_used_count(moe_topk[0])
+
+        moe_shared_expert = hparams["num_shared_expert"]
+        assert all(n == moe_shared_expert[0] for n in moe_shared_expert)
+        self.gguf_writer.add_expert_shared_count(moe_shared_expert[0])
+
+        # Rope
+        rope_scaling = hparams.get("rope_scaling", {})
+        if rope_scaling.get("type") == "dynamic":
+            # HunYuan uses NTK Aware Alpha based scaling. Original implementation: https://www.reddit.com/r/LocalLLaMA/comments/14lz7j5/ntkaware_scaled_rope_allows_llama_models_to_have/
+            # 1000 corresponds to a usable context length of 256k (https://github.com/Tencent-Hunyuan/Hunyuan-A13B/blob/main/report/Hunyuan_A13B_Technical_Report.pdf)
+            alpha = rope_scaling.get("alpha", 1000)
+            base = hparams.get("rope_theta", 10000.0)
+            dim = (hparams["hidden_size"] // hparams["num_attention_heads"])  # 128
+            scaled_base = base * (alpha ** (dim / (dim - 2)))  # 10000 * (1000 ** (128 / 126)) = 11158839.9251
+            self.gguf_writer.add_rope_freq_base(scaled_base)
+            self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
+            self.gguf_writer.add_rope_scaling_factor(1)
+            # There is no consistent way to calculate ctx from alpha, and the config is incorrectly set to 32k
+            self.gguf_writer.add_rope_scaling_orig_ctx_len(256 * 1024)  # 256k context length
+            self.gguf_writer.add_context_length(256 * 1024)  # 256k context length
+
+            # if any of our assumptions about the values are wrong, something has changed and this may need to be updated
+            assert alpha == 1000 and base == 10000.0 and dim == 128 and self.hparams["max_position_embeddings"] in [32 * 1024, 256 * 1024], \
+                "HunYuan dynamic RoPE scaling assumptions changed, please update the logic or context length manually"
+
+    _experts: list[dict[str, Tensor]] | None = None
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        if name == "model.embed_tokens.weight":
+            self._tok_embd = data_torch.clone()
+
+        if name == "lm_head.weight":
+            if self.hparams.get("tie_word_embeddings", False):
+                logger.info("Skipping tied output layer 'lm_head.weight'")
+                return []
+
+        if name.find("mlp.experts") != -1:
+            n_experts = self.hparams["num_experts"]
+            assert bid is not None
+
+            if self._experts is None:
+                self._experts = [{} for _ in range(self.block_count)]
+
+            self._experts[bid][name] = data_torch
+
+            if len(self._experts[bid]) >= n_experts * 3:
+                # merge the experts into a single 3d tensor
+                tensors: list[tuple[str, Tensor]] = []
+                for w_name in ["down_proj", "gate_proj", "up_proj"]:
+                    datas: list[Tensor] = []
+
+                    for xid in range(n_experts):
+                        ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
+                        datas.append(self._experts[bid][ename])
+                        del self._experts[bid][ename]
+
+                    data_torch = torch.stack(datas, dim=0)
+                    merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
+                    new_name = self.map_tensor_name(merged_name)
+                    tensors.append((new_name, data_torch))
+
+                return tensors
+            else:
+                return []
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+    def prepare_tensors(self):
+        super().prepare_tensors()
+        if self._experts is not None:
+            experts = [k for d in self._experts for k in d.keys()]
+            if len(experts) > 0:
+                raise ValueError(f"Unprocessed experts: {experts}")
+
+
+@ModelBase.register("SmolLM3ForCausalLM")
+class SmolLM3Model(LlamaModel):
+    model_arch = gguf.MODEL_ARCH.SMOLLM3
+
 
 ###### CONVERSION LOGIC ######
 
 
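The RoPE frequency base that `set_gguf_parameters` hard-codes can be reproduced outside the converter. Below is a minimal standalone sketch of the NTK-aware alpha formula, assuming the values the in-code comments cite (base 10000, alpha 1000, head dim 128); the variable names are illustrative and not part of the converter.

```python
# Standalone check of the NTK-aware alpha scaling used in HunYuanMoEModel.set_gguf_parameters.
# Values assumed from the comments in the diff, not read from any config.
base = 10000.0   # rope_theta
alpha = 1000     # rope_scaling.alpha, said to correspond to ~256k usable context
dim = 128        # hidden_size // num_attention_heads

scaled_base = base * (alpha ** (dim / (dim - 2)))
print(scaled_base)  # ~11158839.9251, the value written as rope_freq_base
```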
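The expert merge in `modify_tensors` buffers each layer's per-expert `down_proj`, `gate_proj`, and `up_proj` matrices and, once all are seen, stacks each projection into a single 3D tensor with the expert index as the leading dimension. A toy sketch of that stacking, using made-up sizes rather than the real HunYuan dimensions:

```python
import torch

# Toy sizes for illustration only; the real model uses its own n_expert / n_ff / n_embd.
n_experts, n_ff, n_embd = 4, 8, 6
per_expert = [torch.randn(n_ff, n_embd) for _ in range(n_experts)]

# Same operation as in modify_tensors: stack along a new leading expert dimension.
merged = torch.stack(per_expert, dim=0)
assert merged.shape == (n_experts, n_ff, n_embd)
```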