@@ -387,6 +387,7 @@ def __init__(
         self.bpe_tokenizer = json.loads(
             open(str(fname_tokenizer), encoding="utf-8").read()
         )
+        self.vocab = self.bpe_tokenizer["model"]["vocab"]
         added_tokens: dict[str, int]
         if fname_added_tokens is not None:
             # FIXME: Verify that added tokens here _cannot_ overlap with the main vocab.
@@ -405,7 +406,7 @@ def __init__(
             if item["content"] not in self.bpe_tokenizer
         )
 
-        vocab_size: int = len(self.bpe_tokenizer)
+        vocab_size: int = len(self.vocab)
         expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
         actual_ids = sorted(added_tokens.values())
         if expected_ids != actual_ids:
@@ -415,17 +416,17 @@ def __init__(
             )
 
         items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1])
+        self.added_tokens_dict = added_tokens
         self.added_tokens_list = [text for (text, idx) in items]
         self.vocab_size_base: int = vocab_size
         self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_list)
         self.fname_tokenizer = fname_tokenizer
         self.fname_added_tokens = fname_added_tokens
 
     def bpe_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
-        tokenizer = self.bpe_tokenizer
-        reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.items()}
+        reverse_vocab = {id: encoded_tok for encoded_tok, id in self.vocab.items()}
 
-        for i, _ in enumerate(tokenizer):
+        for i, _ in enumerate(self.vocab):
             yield reverse_vocab[i], 0.0, gguf.TokenType.NORMAL
 
     def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
@@ -466,6 +467,7 @@ def __init__(
             )
 
         # Token pieces that were added to the base vocabulary.
+        self.added_tokens_dict = added_tokens
         self.added_tokens_list = [new_tokens[id] for id in actual_new_ids]
         self.vocab_size_base = vocab_size
         self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)
@@ -1006,6 +1008,7 @@ def check_vocab_size(params: Params, vocab: Vocab, pad_vocab: bool = False) -> N
             )
             for i in range(1, pad_count + 1):
                 vocab.added_tokens_dict[f"<dummy{i:05}>"] = -1
+                vocab.added_tokens_list.append(f"<dummy{i:05}>")
             vocab.vocab_size = params.n_vocab
             return
 
@@ -1097,6 +1100,8 @@ def extract_vocabulary_from_model(self, vocab: Vocab) -> Tuple[list, list, list]
             scores.append(score)
             toktypes.append(toktype)
 
+        assert len(tokens) == vocab.vocab_size
+
         return tokens, scores, toktypes
 
     def add_meta_vocab(self, vocab: Vocab) -> None:
@@ -1373,15 +1378,14 @@ def _detect_files(self):
                 self.files[file] = file_path
             elif parent_file_path.exists():
                 self.files[file] = parent_file_path
+        print(f"Found vocab files: {self.files}")
 
     def _select_file(self, vocabtype: Optional[str]) -> Path:
         if vocabtype in ["spm", "bpe"]:
-            # For SentencePiece and BPE, return specific files as before
-            file_key = "tokenizer.model" if vocabtype == "spm" else "vocab.json"
-            if self.files[file_key]:
-                return self.files[file_key]
-            else:
-                raise FileNotFoundError(f"{vocabtype} {file_key} not found.")
+            for file_key in self.files.keys():
+                if self.files[file_key]:
+                    return self.files[file_key]
+            raise FileNotFoundError(f"{vocabtype} vocab not found.")
         elif vocabtype == "hfft":
             # For Hugging Face Fast Tokenizer, return the directory path instead of a specific file
             return self.path
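
Note on why both structures get the dummy entries: the new assert in `extract_vocabulary_from_model` requires the token list handed to the GGUF writer to match `vocab.vocab_size`, so padding in `check_vocab_size` has to extend `added_tokens_list` as well as `added_tokens_dict`. A minimal standalone sketch of that invariant (the vocab sizes and variable names here are illustrative, not taken from the PR):

```python
# Illustrative sizes, not from the PR: target size from params vs. size before padding.
n_vocab = 32064
vocab_size = 32000

added_tokens_dict: dict[str, int] = {}
added_tokens_list: list[str] = []

# Mirror of the padding loop: every dummy goes into both the dict and the list,
# with -1 as the placeholder id, as in the diff above.
pad_count = n_vocab - vocab_size
for i in range(1, pad_count + 1):
    added_tokens_dict[f"<dummy{i:05}>"] = -1
    added_tokens_list.append(f"<dummy{i:05}>")

# This is the property the new assert depends on.
assert len(added_tokens_list) == pad_count
```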