1 file changed: +7 -1 lines changed
Original file line number | Diff line number | Diff line change
@@ -967,7 +967,13 @@ def set_vocab(self):
967
967
from transformers import AutoTokenizer
968
968
tokenizer = AutoTokenizer .from_pretrained (dir_model )
969
969
vocab_size = hparams .get ("vocab_size" , len (tokenizer .vocab ))
970
- assert max (tokenizer .vocab .values ()) < vocab_size
970
+ # The largest token id must be strictly less than vocab_size: vocab_size is a count
971
+ # of entries and ids start at 0, so valid ids run from 0 to vocab_size - 1.
972
+ max_vocab_index = max (tokenizer .get_vocab ().values ())
973
+ if max_vocab_index >= vocab_size :
974
+ raise ValueError ("Vocabulary size exceeds expected maximum size." )
975
+
976
+
971
977
972
978
reverse_vocab : dict [int , str ] = {id_ : encoded_tok for encoded_tok , id_ in tokenizer .vocab .items ()}
973
979
added_vocab = tokenizer .get_added_vocab ()
You can’t perform that action at this time.
0 commit comments