Commit c6f15a7

Read vocabulary for ArcticForCausalLM from sentencepiece model instead of HF tokenizer.
Add/redefine tokens according to added_tokens_decoder from tokenizer_config.json.
1 parent c95013d
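
For reference, a minimal sketch of the kind of added_tokens_decoder data this conversion relies on; the ids, contents and vocab size below are hypothetical placeholders, not values copied from the snowflake-arctic-instruct repository:

import json

# Illustrative shape of tokenizer_config.json (hypothetical ids/contents,
# not copied from the actual model repo).
tokenizer_config_json = json.loads("""
{
  "unk_token": "<unk>",
  "added_tokens_decoder": {
    "0":     {"content": "<unk>",        "special": true},
    "31998": {"content": "<|im_start|>", "special": true},
    "31999": {"content": "<|im_end|>",   "special": true}
  }
}
""")

vocab_size = 32000  # assumed size of the base sentencepiece vocabulary

for token_id_str, token_json in tokenizer_config_json["added_tokens_decoder"].items():
    token_id = int(token_id_str)
    # Ids below vocab_size overwrite pieces already present in tokenizer.model
    # (the "redefine" case); ids at or above it would be genuinely new tokens.
    kind = "redefines" if token_id < vocab_size else "adds"
    print(f"{kind} token {token_id}: {token_json['content']!r} (special={token_json['special']})")

The point is that ids below the base vocabulary size overwrite existing sentencepiece pieces rather than extend the vocabulary, which is why the converter below reads tokenizer.model first and then applies these overrides.
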

File tree: 1 file changed, +81 -1 lines changed

convert-hf-to-gguf.py

Lines changed: 81 additions & 1 deletion
@@ -1522,7 +1522,87 @@ class ArcticModel(Model):
     model_arch = gguf.MODEL_ARCH.ARCTIC
 
     def set_vocab(self):
-        self._set_vocab_llama_hf()
+        # The reason for using a custom implementation here is that the
+        # snowflake-arctic-instruct model redefined tokens 31998 and 31999 from
+        # tokenizer.model and used them as BOS and EOS instead of adding new tokens.
+        from sentencepiece import SentencePieceProcessor
+
+        tokenizer_path = self.dir_model / 'tokenizer.model'
+
+        if not tokenizer_path.is_file():
+            print(f'Error: Missing {tokenizer_path}', file=sys.stderr)
+            sys.exit(1)
+
+        # Read the whole vocabulary from the tokenizer.model file
+        tokenizer = SentencePieceProcessor(str(tokenizer_path))
+
+        vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
+
+        tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
+        scores: list[float] = [-10000.0] * vocab_size
+        toktypes: list[int] = [SentencePieceTokenTypes.UNKNOWN] * vocab_size
+
+        for token_id in range(tokenizer.vocab_size()):
+
+            piece = tokenizer.id_to_piece(token_id)
+            text = piece.encode("utf-8")
+            score = tokenizer.get_score(token_id)
+
+            toktype = SentencePieceTokenTypes.NORMAL
+            if tokenizer.is_unknown(token_id):
+                toktype = SentencePieceTokenTypes.UNKNOWN
+            elif tokenizer.is_control(token_id):
+                toktype = SentencePieceTokenTypes.CONTROL
+            elif tokenizer.is_unused(token_id):
+                toktype = SentencePieceTokenTypes.UNUSED
+            elif tokenizer.is_byte(token_id):
+                toktype = SentencePieceTokenTypes.BYTE
+
+            tokens[token_id] = text
+            scores[token_id] = score
+            toktypes[token_id] = toktype
+
+        # Use the added_tokens_decoder field from tokenizer_config.json as the source
+        # of information about added/redefined tokens and modify them accordingly.
+        tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
+        if tokenizer_config_file.is_file():
+            with open(tokenizer_config_file, "r", encoding="utf-8") as f:
+                tokenizer_config_json = json.load(f)
+
+                if "added_tokens_decoder" in tokenizer_config_json:
+                    added_tokens_decoder = tokenizer_config_json["added_tokens_decoder"]
+                    for token_id, token_json in added_tokens_decoder.items():
+                        token_id = int(token_id)
+                        if (token_id >= vocab_size):
+                            print(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
+                            continue
+
+                        token_content = token_json["content"]
+                        token_type = SentencePieceTokenTypes.USER_DEFINED
+                        token_score = -10000.0
+
+                        # Map unk_token to UNKNOWN, other special tokens to CONTROL
+                        # Set the score to 0.0 as in the original tokenizer.model
+                        if ("special" in token_json) and token_json["special"]:
+                            if token_content == tokenizer_config_json["unk_token"]:
+                                token_type = SentencePieceTokenTypes.UNKNOWN
+                            else:
+                                token_type = SentencePieceTokenTypes.CONTROL
+                                token_score = 0.0
+
+                        print(f"Setting token {token_id} to '{token_content}' (type: {token_type}, score: {token_score:.2f})")
+                        tokens[token_id] = token_content.encode("utf-8")
+                        toktypes[token_id] = token_type
+                        scores[token_id] = token_score
+
+        self.gguf_writer.add_tokenizer_model("llama")
+        self.gguf_writer.add_tokenizer_pre("default")
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_scores(scores)
+        self.gguf_writer.add_token_types(toktypes)
+
+        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
+        special_vocab.add_to_gguf(self.gguf_writer)
 
     def set_gguf_parameters(self):
         super().set_gguf_parameters()
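
As a sanity check before or after conversion, one can inspect what tokenizer.model itself stores at the two redefined ids. A rough sketch, assuming the model files sit in a local directory (the path is a placeholder) and using only the sentencepiece calls the diff above already relies on:

import sys
from pathlib import Path

from sentencepiece import SentencePieceProcessor

model_dir = Path('path/to/snowflake-arctic-instruct')  # placeholder path
tokenizer_path = model_dir / 'tokenizer.model'

if not tokenizer_path.is_file():
    print(f'Missing {tokenizer_path}', file=sys.stderr)
    sys.exit(1)

sp = SentencePieceProcessor(str(tokenizer_path))

# Print what the sentencepiece model originally stores at the two ids that,
# per the commit message, the HF config repurposes as BOS and EOS.
for token_id in (31998, 31999):
    piece = sp.id_to_piece(token_id)
    print(f"id {token_id}: piece={piece!r}, score={sp.get_score(token_id):.2f}, "
          f"control={sp.is_control(token_id)}")

If the printed pieces differ from what added_tokens_decoder assigns to those ids, that confirms why the generic _set_vocab_llama_hf() path was replaced with this custom implementation.
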
