@@ -80,7 +80,7 @@ def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path,
         if not self.is_safetensors:
             self.part_names = Model.get_model_part_names(self.dir_model, "pytorch_model", ".bin")
         self.hparams = Model.load_hparams(self.dir_model)
-        self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer"])
+        self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer", "num_layers"])
         self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
         self.tensor_names = None
         if self.ftype == gguf.LlamaFileType.GUESSED:
@@ -2768,6 +2768,124 @@ def write_tensors(self):
                 raise ValueError(f"Unprocessed experts: {experts}")


+@Model.register("T5ForConditionalGeneration")
+@Model.register("T5WithLMHeadModel")
+class T5Model(Model):
+    model_arch = gguf.MODEL_ARCH.T5
+
+    def set_vocab(self):
+        # to avoid TypeError: Descriptors cannot be created directly
+        # exception when importing sentencepiece_model_pb2
+        os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
+        from sentencepiece import SentencePieceProcessor
+        from sentencepiece import sentencepiece_model_pb2 as model
+
+        tokenizer_path = self.dir_model / 'spiece.model'
+
+        if not tokenizer_path.is_file():
+            raise FileNotFoundError(f"File not found: {tokenizer_path}")
+
+        sentencepiece_model = model.ModelProto()
+        sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
+        add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix
+        remove_whitespaces = sentencepiece_model.normalizer_spec.remove_extra_whitespaces
+        precompiled_charsmap = sentencepiece_model.normalizer_spec.precompiled_charsmap
+        assert sentencepiece_model.trainer_spec.model_type == 1  # UNIGRAM
+
+        tokenizer = SentencePieceProcessor()
+        tokenizer.LoadFromFile(str(tokenizer_path))
+
+        vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
+
+        tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
+        scores: list[float] = [-10000.0] * vocab_size
+        toktypes: list[int] = [SentencePieceTokenTypes.UNKNOWN] * vocab_size
+
+        for token_id in range(tokenizer.vocab_size()):
+            piece = tokenizer.IdToPiece(token_id)
+            text = piece.encode("utf-8")
+            score = tokenizer.GetScore(token_id)
+
+            toktype = SentencePieceTokenTypes.NORMAL
+            if tokenizer.IsUnknown(token_id):
+                toktype = SentencePieceTokenTypes.UNKNOWN
+            elif tokenizer.IsControl(token_id):
+                toktype = SentencePieceTokenTypes.CONTROL
+            elif tokenizer.IsUnused(token_id):
+                toktype = SentencePieceTokenTypes.UNUSED
+            elif tokenizer.IsByte(token_id):
+                toktype = SentencePieceTokenTypes.BYTE
+
+            tokens[token_id] = text
+            scores[token_id] = score
+            toktypes[token_id] = toktype
+
+        added_tokens_file = self.dir_model / 'added_tokens.json'
+        if added_tokens_file.is_file():
+            with open(added_tokens_file, "r", encoding="utf-8") as f:
+                added_tokens_json = json.load(f)
+                for key in added_tokens_json:
+                    token_id = added_tokens_json[key]
+                    if (token_id >= vocab_size):
+                        logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
+                        continue
+
+                    tokens[token_id] = key.encode("utf-8")
+                    scores[token_id] = -1000.0
+                    toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
+
+        if vocab_size > len(tokens):
+            pad_count = vocab_size - len(tokens)
+            logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]")
+            for i in range(1, pad_count + 1):
+                tokens.append(bytes(f"[PAD{i}]", encoding="utf-8"))
+                scores.append(-1000.0)
+                toktypes.append(SentencePieceTokenTypes.UNUSED)
+
+        self.gguf_writer.add_tokenizer_model("t5")
+        self.gguf_writer.add_tokenizer_pre("default")
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_scores(scores)
+        self.gguf_writer.add_token_types(toktypes)
+        self.gguf_writer.add_add_space_prefix(add_prefix)
+        self.gguf_writer.add_remove_extra_whitespaces(remove_whitespaces)
+        if precompiled_charsmap:
+            self.gguf_writer.add_precompiled_charsmap(precompiled_charsmap)
+
+        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
+        special_vocab.add_to_gguf(self.gguf_writer)
+
+        self.gguf_writer.add_add_bos_token(False)
+        self.gguf_writer.add_add_eos_token(True)
+
+    def set_gguf_parameters(self):
+        self.gguf_writer.add_name("T5")
+        self.gguf_writer.add_context_length(self.hparams["n_positions"])
+        self.gguf_writer.add_embedding_length(self.hparams["d_model"])
+        self.gguf_writer.add_feed_forward_length(self.hparams["d_ff"])
+        self.gguf_writer.add_block_count(self.hparams["num_layers"])
+        self.gguf_writer.add_head_count(self.hparams["num_heads"])
+        self.gguf_writer.add_key_length(self.hparams["d_kv"])
+        self.gguf_writer.add_value_length(self.hparams["d_kv"])
+        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
+        self.gguf_writer.add_relative_attn_buckets_count(self.hparams["relative_attention_num_buckets"])
+        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"])
+        self.gguf_writer.add_decoder_start_token_id(self.hparams["decoder_start_token_id"])
+        self.gguf_writer.add_file_type(self.ftype)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid  # unused
+
+        # Sometimes T5 and Flan-T5 based models contain "encoder.embed_tokens.weight" tensor or
+        # "decoder.embed_tokens.weight" tensor that are duplicates of "shared.weight" tensor.
+        # To prevent errors caused by an unnecessary unmapped tensor, skip both of them and use only "shared.weight".
+        if name == "decoder.embed_tokens.weight" or name == "encoder.embed_tokens.weight":
+            logger.debug(f"Skipping tensor {name!r} in safetensors so that convert can end normally.")
+            return []
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+
 ###### CONVERSION LOGIC ######

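Note: T5-style config.json files name the layer count "num_layers" (rather than "n_layers", "num_hidden_layers", or "n_layer"), which is why the candidate key list in the first hunk and the add_block_count(self.hparams["num_layers"]) call above read that field. Below is a minimal, self-contained sketch of that first-match lookup; the helper and the config values are hypothetical stand-ins, not the project's code.

# Hypothetical stand-in for the converter's find_hparam(): return the value of
# the first candidate key that is present in the loaded config dictionary.
def find_hparam(hparams: dict, keys: list):
    for key in keys:
        if key in hparams:
            return hparams[key]
    raise KeyError(f"none of {keys} found in hparams")

t5_style_config = {"d_model": 512, "num_layers": 6}  # assumed T5-style hparams
block_count = find_hparam(t5_style_config, ["n_layers", "num_hidden_layers", "n_layer", "num_layers"])
print(block_count)  # -> 6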