@@ -80,7 +80,7 @@ def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path,
         if not self.is_safetensors:
             self.part_names = Model.get_model_part_names(self.dir_model, "pytorch_model", ".bin")
         self.hparams = Model.load_hparams(self.dir_model)
-        self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer"])
+        self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer", "num_layers"])
         self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
         self.tensor_names = None
         if self.ftype == gguf.LlamaFileType.GUESSED:
@@ -2725,6 +2725,167 @@ def write_tensors(self):
                 raise ValueError(f"Unprocessed experts: {experts}")


+@Model.register("ChatGLMModel")
+class ChatGLMModel(Model):
+    model_arch = gguf.MODEL_ARCH.CHATGLM
+
+    def set_vocab(self):
+        dir_model = self.dir_model
+        hparams = self.hparams
+        tokens: list[bytearray] = []
+        toktypes: list[int] = []
+        scores: list[float] = []
+
+        from transformers import AutoTokenizer
+        tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
+        vocab_size = hparams.get("padded_vocab_size", len(tokenizer.get_vocab()))
+        assert max(tokenizer.get_vocab().values()) < vocab_size
+
+        reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.get_vocab().items()}
+
+        for token_id in range(vocab_size):
+            piece = tokenizer._convert_id_to_token(token_id)
+            if token_id == 0:
+                piece = "<unk>"
+            elif token_id == 1:
+                piece = "<bos>"
+            elif token_id == 2:
+                piece = "<eos>"
+
+            text = piece.encode("utf-8")
+            score = 0.0
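+            # token ids below 64789 appear to come from the underlying SentencePiece model;
+            # ids at or above that boundary are ChatGLM's added special tokens without sp_model scores (assumption)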
+            if len(piece) != 0 and token_id < 64789:
+                score = tokenizer.tokenizer.sp_model.get_score(token_id)
+
+            if len(piece) == 0:
+                text = f"[PAD{token_id}]".encode("utf-8")
+
+            if token_id >= 64789:
+                toktype = SentencePieceTokenTypes.UNKNOWN
+                tokens.append(text)
+                scores.append(score)
+                toktypes.append(toktype)
+                continue
+
+            toktype = SentencePieceTokenTypes.NORMAL
+            if tokenizer.tokenizer.sp_model.is_unknown(token_id):
+                toktype = SentencePieceTokenTypes.UNKNOWN
+            elif tokenizer.tokenizer.sp_model.is_control(token_id):
+                toktype = SentencePieceTokenTypes.CONTROL
+            elif tokenizer.tokenizer.sp_model.is_unused(token_id):
+                toktype = SentencePieceTokenTypes.UNUSED
+            elif tokenizer.tokenizer.sp_model.is_byte(token_id):
+                toktype = SentencePieceTokenTypes.BYTE
+
+            tokens.append(text)
+            scores.append(score)
+            toktypes.append(toktype)
+
+        self.gguf_writer.add_tokenizer_model("llama")
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_scores(scores)
+        self.gguf_writer.add_token_types(toktypes)
+
+        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
+        special_vocab.add_to_gguf(self.gguf_writer)
+
+    def set_gguf_parameters(self):
+        self.gguf_writer.add_name("ChatGLM-6b-chat")
+        n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed"))
+        n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads"))
+        n_head_kv = self.hparams.get("multi_query_group_num", n_head)
+        self.gguf_writer.add_context_length(self.hparams.get("seq_length", n_embed))
+        self.gguf_writer.add_embedding_length(n_embed)
+        self.gguf_writer.add_feed_forward_length(self.hparams.get("ffn_hidden_size", 4 * n_embed))
+        self.gguf_writer.add_block_count(self.hparams["num_layers"])
+        self.gguf_writer.add_head_count(n_head)
+        self.gguf_writer.add_head_count_kv(n_head_kv)
+        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layernorm_epsilon"])
+        self.gguf_writer.add_file_type(self.ftype)
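+        # assumption: ChatGLM uses 128-dim heads and applies RoPE to only half of each head, hence the hard-coded 64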
+        self.gguf_writer.add_rope_dimension_count(64)
+        self.gguf_writer.add_add_bos_token(False)
+
+    def write_tensors(self):
+        block_count = self.hparams["num_layers"]
+        tensors = dict(self.get_tensors())
+        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
+        has_lm_head = True
+        n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads"))
+        n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed"))
+
+        for name, data_torch in tensors.items():
+            if name.endswith(".rotary_pos_emb.inv_freq"):
+                continue
+
+            if "lm_head.weight" not in tensors.keys() and "output.weight" not in tensors.keys():
+                has_lm_head = False
+
+            name = re.sub(r'transformer\.', '', name)
+
+            old_dtype = data_torch.dtype
+
+            # convert any unsupported data types to float32
+            if data_torch.dtype not in (torch.float16, torch.float32):
+                data_torch = data_torch.to(torch.float32)
+
+            data = data_torch.squeeze().numpy()
+
+            if re.match(r"h\.\d+\.self_attention\.query_key_value\.weight", name):
+                # Map bloom-style qkv_linear to gpt-style qkv_linear
+                # bloom: https://github.com/huggingface/transformers/blob/main/src/transformers/models/bloom/modeling_bloom.py#L238-L252  # noqa
+                # gpt-2: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py#L312  # noqa
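+                # reshape to (n_head, 3, head_dim, n_embed), then concatenate the per-head q/k/v slices into contiguous Q, K, V blocks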
+                qkv_weights = data.reshape((n_head, 3, n_embed // n_head, n_embed))
+                data = np.concatenate(
+                    (
+                        qkv_weights[:, 0, :, :].reshape((-1, n_embed)),
+                        qkv_weights[:, 1, :, :].reshape((-1, n_embed)),
+                        qkv_weights[:, 2, :, :].reshape((-1, n_embed)),
+                    ),
+                    axis=0,
+                )
+                print("re-format attention.linear_qkv.weight")
+            elif re.match(r"h\.\d+\.self_attention\.query_key_value\.bias", name):
+                qkv_bias = data.reshape((n_head, 3, n_embed // n_head))
+                data = np.concatenate(
+                    (
+                        qkv_bias[:, 0, :].reshape((n_embed,)),
+                        qkv_bias[:, 1, :].reshape((n_embed,)),
+                        qkv_bias[:, 2, :].reshape((n_embed,)),
+                    ),
+                    axis=0,
+                )
+                print("re-format attention.linear_qkv.bias")
+
+            # map tensor names
+            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
+            if new_name is None:
+                print(f"Can not map tensor {name!r}")
+                sys.exit()
+
+            n_dims = len(data.shape)
+            data_dtype = data.dtype
+
+            # if f32 desired, convert any float16 to float32
+            if self.ftype == 0 and data_dtype == np.float16:
+                data = data.astype(np.float32)
+
+            # TODO: Why can't we use these float16 as-is? There should be no reason to store float16 as float32
+            if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
+                data = data.astype(np.float32)
+
+            # if f16 desired, convert any float32 2-dim weight tensors to float16
+            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
+                data = data.astype(np.float16)
+
+            print(f"=> {new_name}, shape = {data.shape}, {old_dtype} --> {data.dtype}")
+
+            self.gguf_writer.add_tensor(new_name, data)
+
+            if not has_lm_head and name == "word_embeddings.weight":
+                self.gguf_writer.add_tensor("output.weight", data)
+                print(name, f"=> output.weight, shape = {data.shape}, {old_dtype} --> {data.dtype}")
+
+
###### CONVERSION LOGIC ######