@@ -80,7 +80,7 @@ def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path,
         if not self.is_safetensors:
             self.part_names = Model.get_model_part_names(self.dir_model, "pytorch_model", ".bin")
         self.hparams = Model.load_hparams(self.dir_model)
-        self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer"])
+        self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer", "num_layers"])
         self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
         self.tensor_names = None
         if self.ftype == gguf.LlamaFileType.GUESSED:
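The hunk above appends "num_layers" to the fallback key list because ChatGLM checkpoints store the layer count under that name (the new ChatGLMModel class below reads self.hparams["num_layers"] directly). A minimal sketch of the first-match lookup this change relies on, using a hypothetical standalone helper rather than the real Model.find_hparam method, with example config values:

from typing import Any

def first_hparam(hparams: dict[str, Any], keys: list[str]) -> Any:
    # Illustrative helper only (the real lookup lives in Model.find_hparam):
    # return the value of the first key present in the config.
    for key in keys:
        if key in hparams:
            return hparams[key]
    raise KeyError(f"could not find any of {keys}")

# A ChatGLM-style config names the layer count "num_layers",
# which the extended key list now picks up.
example_hparams = {"num_layers": 28, "hidden_size": 4096}
assert first_hparam(example_hparams, ["n_layers", "num_hidden_layers", "n_layer", "num_layers"]) == 28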
@@ -2716,6 +2716,167 @@ def write_tensors(self):
                 raise ValueError(f"Unprocessed experts: {experts}")


+@Model.register("ChatGLMModel")
+class ChatGLMModel(Model):
+    model_arch = gguf.MODEL_ARCH.CHATGLM
+
+    def set_vocab(self):
+        dir_model = self.dir_model
+        hparams = self.hparams
+        tokens: list[bytearray] = []
+        toktypes: list[int] = []
+        scores: list[float] = []
+
+        from transformers import AutoTokenizer
+        tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
+        vocab_size = hparams.get("padded_vocab_size", len(tokenizer.get_vocab()))
+        assert max(tokenizer.get_vocab().values()) < vocab_size
+
+        reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.get_vocab().items()}
+
+        for token_id in range(vocab_size):
+            piece = tokenizer._convert_id_to_token(token_id)
+            if token_id == 0:
+                piece = "<unk>"
+            elif token_id == 1:
+                piece = "<bos>"
+            elif token_id == 2:
+                piece = "<eos>"
+
+            text = piece.encode("utf-8")
+            score = 0.0
+            if len(piece) != 0 and token_id < 64789:
+                score = tokenizer.tokenizer.sp_model.get_score(token_id)
+
+            if len(piece) == 0:
+                text = f"[PAD{token_id}]".encode("utf-8")
+
+            if token_id >= 64789:
+                toktype = SentencePieceTokenTypes.UNKNOWN
+                tokens.append(text)
+                scores.append(score)
+                toktypes.append(toktype)
+                continue
+
+            toktype = SentencePieceTokenTypes.NORMAL
+            if tokenizer.tokenizer.sp_model.is_unknown(token_id):
+                toktype = SentencePieceTokenTypes.UNKNOWN
+            elif tokenizer.tokenizer.sp_model.is_control(token_id):
+                toktype = SentencePieceTokenTypes.CONTROL
+            elif tokenizer.tokenizer.sp_model.is_unused(token_id):
+                toktype = SentencePieceTokenTypes.UNUSED
+            elif tokenizer.tokenizer.sp_model.is_byte(token_id):
+                toktype = SentencePieceTokenTypes.BYTE
+
+            tokens.append(text)
+            scores.append(score)
+            toktypes.append(toktype)
+
+        self.gguf_writer.add_tokenizer_model("llama")
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_scores(scores)
+        self.gguf_writer.add_token_types(toktypes)
+
+        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
+        special_vocab.add_to_gguf(self.gguf_writer)
+
+    def set_gguf_parameters(self):
+        self.gguf_writer.add_name("ChatGLM-6b-chat")
+        n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed"))
+        n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads"))
+        n_head_kv = self.hparams.get("multi_query_group_num", n_head)
+        self.gguf_writer.add_context_length(self.hparams.get("seq_length", n_embed))
+        self.gguf_writer.add_embedding_length(n_embed)
+        self.gguf_writer.add_feed_forward_length(self.hparams.get("ffn_hidden_size", 4 * n_embed))
+        self.gguf_writer.add_block_count(self.hparams["num_layers"])
+        self.gguf_writer.add_head_count(n_head)
+        self.gguf_writer.add_head_count_kv(n_head_kv)
+        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layernorm_epsilon"])
+        self.gguf_writer.add_file_type(self.ftype)
+        self.gguf_writer.add_rope_dimension_count(64)
+        self.gguf_writer.add_add_bos_token(False)
+
+    def write_tensors(self):
+        block_count = self.hparams["num_layers"]
+        tensors = dict(self.get_tensors())
+        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
+        has_lm_head = True
+        n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads"))
+        n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed"))
+
+        for name, data_torch in tensors.items():
+            if name.endswith(".rotary_pos_emb.inv_freq"):
+                continue
+
+            if "lm_head.weight" not in tensors.keys() and "output.weight" not in tensors.keys():
+                has_lm_head = False
+
+            name = re.sub(r'transformer\.', '', name)
+
+            old_dtype = data_torch.dtype
+
+            # convert any unsupported data types to float32
+            if data_torch.dtype not in (torch.float16, torch.float32):
+                data_torch = data_torch.to(torch.float32)
+
+            data = data_torch.squeeze().numpy()
+
+            if re.match(r"h\.\d+\.self_attention\.query_key_value\.weight", name):
+                # Map bloom-style qkv_linear to gpt-style qkv_linear
+                # bloom: https://github.com/huggingface/transformers/blob/main/src/transformers/models/bloom/modeling_bloom.py#L238-L252  # noqa
+                # gpt-2: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py#L312  # noqa
+                qkv_weights = data.reshape((n_head, 3, n_embed // n_head, n_embed))
+                data = np.concatenate(
+                    (
+                        qkv_weights[:, 0, :, :].reshape((-1, n_embed)),
+                        qkv_weights[:, 1, :, :].reshape((-1, n_embed)),
+                        qkv_weights[:, 2, :, :].reshape((-1, n_embed)),
+                    ),
+                    axis=0,
+                )
+                print("re-format attention.linear_qkv.weight")
+            elif re.match(r"h\.\d+\.self_attention\.query_key_value\.bias", name):
+                qkv_bias = data.reshape((n_head, 3, n_embed // n_head))
+                data = np.concatenate(
+                    (
+                        qkv_bias[:, 0, :].reshape((n_embed,)),
+                        qkv_bias[:, 1, :].reshape((n_embed,)),
+                        qkv_bias[:, 2, :].reshape((n_embed,)),
+                    ),
+                    axis=0,
+                )
+                print("re-format attention.linear_qkv.bias")
+
+            # map tensor names
+            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
+            if new_name is None:
+                print(f"Can not map tensor {name!r}")
+                sys.exit()
+
+            n_dims = len(data.shape)
+            data_dtype = data.dtype
+
+            # if f32 desired, convert any float16 to float32
+            if self.ftype == 0 and data_dtype == np.float16:
+                data = data.astype(np.float32)
+
+            # TODO: Why can't we use these float16 as-is? There should be no reason to store float16 as float32
+            if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
+                data = data.astype(np.float32)
+
+            # if f16 desired, convert any float32 2-dim weight tensors to float16
+            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
+                data = data.astype(np.float16)
+
+            print(f"=> {new_name}, shape = {data.shape}, {old_dtype} --> {data.dtype}")
+
+            self.gguf_writer.add_tensor(new_name, data)
+
+            if not has_lm_head and name == "word_embeddings.weight":
+                self.gguf_writer.add_tensor("output.weight", data)
+                print(name, f"=> output.weight, shape = {data.shape}, {old_dtype} --> {data.dtype}")
+
+
 ###### CONVERSION LOGIC ######


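For reference, the bloom-style to gpt-style QKV re-layout performed in ChatGLMModel.write_tensors can be checked in isolation. A minimal sketch with toy sizes chosen only for illustration (n_head = 2, n_embed = 4), not the model's real dimensions:

import numpy as np

# Toy sizes for illustration only; the real conversion uses the model's hparams.
n_head, n_embed = 2, 4
head_dim = n_embed // n_head

# Fused QKV weight in bloom-style layout: per head, the q/k/v blocks are interleaved,
# i.e. rows are [q_h0, k_h0, v_h0, q_h1, k_h1, v_h1], each block head_dim rows tall.
qkv = np.arange(3 * n_embed * n_embed, dtype=np.float32).reshape((3 * n_embed, n_embed))

# Same re-layout as in write_tensors: group all Q rows, then all K rows, then all V rows.
qkv_weights = qkv.reshape((n_head, 3, head_dim, n_embed))
gpt_style = np.concatenate(
    (
        qkv_weights[:, 0, :, :].reshape((-1, n_embed)),  # Q rows of every head
        qkv_weights[:, 1, :, :].reshape((-1, n_embed)),  # K rows of every head
        qkv_weights[:, 2, :, :].reshape((-1, n_embed)),  # V rows of every head
    ),
    axis=0,
)

assert gpt_style.shape == (3 * n_embed, n_embed)
# The first Q row of head 1 originally sat at row 3 * head_dim of the fused matrix.
assert np.array_equal(gpt_style[head_dim], qkv[3 * head_dim])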