
Commit 4f4bf35

py : fix missing added_tokens_dict for SPM and BPE vocabs (#4971)
* py : fix missing added_tokens_dict for SPM vocab
* py : pad with unknown tokens when data is missing
ggml-ci
* py : fix BPE vocab conversion
ggml-ci
* py : fix padded dummy tokens (I hope)
1 parent 2b3a665 commit 4f4bf35

File tree

1 file changed: +14 -10 lines changed

convert.py

Lines changed: 14 additions & 10 deletions
@@ -387,6 +387,7 @@ def __init__(
         self.bpe_tokenizer = json.loads(
             open(str(fname_tokenizer), encoding="utf-8").read()
         )
+        self.vocab = self.bpe_tokenizer["model"]["vocab"]
         added_tokens: dict[str, int]
         if fname_added_tokens is not None:
             # FIXME: Verify that added tokens here _cannot_ overlap with the main vocab.
@@ -405,7 +406,7 @@ def __init__(
                     if item["content"] not in self.bpe_tokenizer
                 )

-        vocab_size: int = len(self.bpe_tokenizer)
+        vocab_size: int = len(self.vocab)
         expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
         actual_ids = sorted(added_tokens.values())
         if expected_ids != actual_ids:
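
The two hunks above fix how the BPE vocabulary size is computed: in a Hugging Face tokenizer.json the token-to-id map lives under the "model" and "vocab" keys, so len(self.bpe_tokenizer) was counting top-level metadata keys rather than tokens. A minimal sketch of the distinction, assuming a tokenizer.json in the usual Hugging Face tokenizers layout (illustration only, not convert.py code):

import json

# Read a Hugging Face tokenizer.json; the top-level object is metadata,
# the actual token -> id mapping sits under ["model"]["vocab"].
with open("tokenizer.json", encoding="utf-8") as f:
    bpe_tokenizer = json.load(f)

vocab = bpe_tokenizer["model"]["vocab"]   # {token_string: token_id, ...}

print(len(bpe_tokenizer))  # number of top-level keys (version, added_tokens, model, ...)
print(len(vocab))          # the real base vocabulary size used for the added-token id checks
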
@@ -415,17 +416,17 @@ def __init__(
             )

         items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1])
+        self.added_tokens_dict = added_tokens
         self.added_tokens_list = [text for (text, idx) in items]
         self.vocab_size_base: int = vocab_size
         self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_list)
         self.fname_tokenizer = fname_tokenizer
         self.fname_added_tokens = fname_added_tokens

     def bpe_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
-        tokenizer = self.bpe_tokenizer
-        reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.items()}
+        reverse_vocab = {id: encoded_tok for encoded_tok, id in self.vocab.items()}

-        for i, _ in enumerate(tokenizer):
+        for i, _ in enumerate(self.vocab):
             yield reverse_vocab[i], 0.0, gguf.TokenType.NORMAL

     def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
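
With self.vocab stored on the instance, bpe_tokens() builds its reverse map from the actual vocabulary instead of from the whole tokenizer.json object. The pattern is a plain id-ordered walk over a token-to-id dict; a simplified stand-alone version with toy data, not the real class:

# Toy version of the id-ordered iteration in bpe_tokens().
vocab = {"hello": 0, "world": 1, "!": 2}                         # token -> id
reverse_vocab = {tok_id: tok for tok, tok_id in vocab.items()}   # id -> token

for i, _ in enumerate(vocab):      # ids are assumed to be contiguous from 0
    print(i, reverse_vocab[i])     # emits tokens in id order, as the converter expects
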
@@ -466,6 +467,7 @@ def __init__(
             )

         # Token pieces that were added to the base vocabulary.
+        self.added_tokens_dict = added_tokens
         self.added_tokens_list = [new_tokens[id] for id in actual_new_ids]
         self.vocab_size_base = vocab_size
         self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)
@@ -1006,6 +1008,7 @@ def check_vocab_size(params: Params, vocab: Vocab, pad_vocab: bool = False) -> N
10061008
)
10071009
for i in range(1, pad_count + 1):
10081010
vocab.added_tokens_dict[f"<dummy{i:05}>"] = -1
1011+
vocab.added_tokens_list.append(f"<dummy{i:05}>")
10091012
vocab.vocab_size = params.n_vocab
10101013
return
10111014
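
This hunk is the heart of the "padded dummy tokens" fix: when vocab padding is enabled (the pad_vocab flag in the signature above) and the vocabulary has to grow to the model's n_vocab, the dummy entries now go into added_tokens_list as well as added_tokens_dict, so the token count that is later written out actually reaches vocab_size. A stripped-down sketch of the invariant, using a toy class rather than the real Vocab types:

# Toy illustration: padding must grow both the dict and the list,
# otherwise vocab_size and the number of emitted tokens drift apart.
class ToyVocab:
    def __init__(self, vocab_size_base: int) -> None:
        self.vocab_size_base = vocab_size_base
        self.added_tokens_dict: dict[str, int] = {}
        self.added_tokens_list: list[str] = []
        self.vocab_size = vocab_size_base

def pad_vocab(vocab: ToyVocab, n_vocab: int) -> None:
    pad_count = n_vocab - vocab.vocab_size
    for i in range(1, pad_count + 1):
        vocab.added_tokens_dict[f"<dummy{i:05}>"] = -1       # placeholder id
        vocab.added_tokens_list.append(f"<dummy{i:05}>")     # the previously missing half
    vocab.vocab_size = n_vocab

v = ToyVocab(vocab_size_base=32000)
pad_vocab(v, 32003)
print(v.added_tokens_list)   # ['<dummy00001>', '<dummy00002>', '<dummy00003>']
assert v.vocab_size == v.vocab_size_base + len(v.added_tokens_list)

The assert added in the following hunk enforces the same invariant when the tokens are collected for writing: len(tokens) must equal vocab.vocab_size.
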

@@ -1097,6 +1100,8 @@ def extract_vocabulary_from_model(self, vocab: Vocab) -> Tuple[list, list, list]:
             scores.append(score)
             toktypes.append(toktype)

+        assert(len(tokens) == vocab.vocab_size)
+
         return tokens, scores, toktypes

     def add_meta_vocab(self, vocab: Vocab) -> None:
@@ -1373,15 +1378,14 @@ def _detect_files(self):
13731378
self.files[file] = file_path
13741379
elif parent_file_path.exists():
13751380
self.files[file] = parent_file_path
1381+
print(f"Found vocab files: {self.files}")
13761382

13771383
def _select_file(self, vocabtype: Optional[str]) -> Path:
13781384
if vocabtype in ["spm", "bpe"]:
1379-
# For SentencePiece and BPE, return specific files as before
1380-
file_key = "tokenizer.model" if vocabtype == "spm" else "vocab.json"
1381-
if self.files[file_key]:
1382-
return self.files[file_key]
1383-
else:
1384-
raise FileNotFoundError(f"{vocabtype} {file_key} not found.")
1385+
for file_key in self.files.keys():
1386+
if self.files[file_key]:
1387+
return self.files[file_key]
1388+
raise FileNotFoundError(f"{vocabtype} vocab not found.")
13851389
elif vocabtype == "hfft":
13861390
# For Hugging Face Fast Tokenizer, return the directory path instead of a specific file
13871391
return self.path
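
Finally, _select_file no longer hard-codes tokenizer.model for spm and vocab.json for bpe; it returns the first vocab file that _detect_files actually found. A rough stand-alone sketch of that behaviour, assuming self.files maps each expected file name to a detected Path or None (toy names, not the real class):

from pathlib import Path
from typing import Optional

def select_vocab_file(files: dict[str, Optional[Path]], vocabtype: str) -> Path:
    # First detected file wins, whatever its name; mirrors the new loop above.
    for file_key in files.keys():
        if files[file_key]:
            return files[file_key]
    raise FileNotFoundError(f"{vocabtype} vocab not found.")

detected = {
    "tokenizer.model": None,                        # no SentencePiece model present
    "vocab.json": Path("model/vocab.json"),         # BPE vocab found instead
    "tokenizer.json": Path("model/tokenizer.json"),
}
print(select_vocab_file(detected, "bpe"))   # model/vocab.json

One consequence of the new loop is that the choice now depends on the order in which _detect_files stored the candidates, not on vocabtype itself.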
