@@ -1522,7 +1522,87 @@ class ArcticModel(Model):
     model_arch = gguf.MODEL_ARCH.ARCTIC
 
     def set_vocab(self):
-        self._set_vocab_llama_hf()
+        # The reason for using a custom implementation here is that the
+        # snowflake-arctic-instruct model redefined tokens 31998 and 31999 from
+        # tokenizer.model and used them as BOS and EOS instead of adding new tokens.
+        from sentencepiece import SentencePieceProcessor
+
+        tokenizer_path = self.dir_model / 'tokenizer.model'
+
+        if not tokenizer_path.is_file():
+            print(f'Error: Missing {tokenizer_path}', file=sys.stderr)
+            sys.exit(1)
+
+        # Read the whole vocabulary from the tokenizer.model file
+        tokenizer = SentencePieceProcessor(str(tokenizer_path))
+
+        vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
+
+        tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
+        scores: list[float] = [-10000.0] * vocab_size
+        toktypes: list[int] = [SentencePieceTokenTypes.UNKNOWN] * vocab_size
+
+        for token_id in range(tokenizer.vocab_size()):
+
+            piece = tokenizer.id_to_piece(token_id)
+            text = piece.encode("utf-8")
+            score = tokenizer.get_score(token_id)
+
+            toktype = SentencePieceTokenTypes.NORMAL
+            if tokenizer.is_unknown(token_id):
+                toktype = SentencePieceTokenTypes.UNKNOWN
+            elif tokenizer.is_control(token_id):
+                toktype = SentencePieceTokenTypes.CONTROL
+            elif tokenizer.is_unused(token_id):
+                toktype = SentencePieceTokenTypes.UNUSED
+            elif tokenizer.is_byte(token_id):
+                toktype = SentencePieceTokenTypes.BYTE
+
+            tokens[token_id] = text
+            scores[token_id] = score
+            toktypes[token_id] = toktype
+
+        # Use the added_tokens_decoder field from tokenizer_config.json as the source
+        # of information about added/redefined tokens and modify them accordingly.
+        tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
+        if tokenizer_config_file.is_file():
+            with open(tokenizer_config_file, "r", encoding="utf-8") as f:
+                tokenizer_config_json = json.load(f)
+
+                if "added_tokens_decoder" in tokenizer_config_json:
+                    added_tokens_decoder = tokenizer_config_json["added_tokens_decoder"]
+                    for token_id, token_json in added_tokens_decoder.items():
+                        token_id = int(token_id)
+                        if (token_id >= vocab_size):
+                            print(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
+                            continue
+
+                        token_content = token_json["content"]
+                        token_type = SentencePieceTokenTypes.USER_DEFINED
+                        token_score = -10000.0
+
+                        # Map unk_token to UNKNOWN, other special tokens to CONTROL
+                        # Set the score to 0.0 as in the original tokenizer.model
+                        if ("special" in token_json) and token_json["special"]:
+                            if token_content == tokenizer_config_json["unk_token"]:
+                                token_type = SentencePieceTokenTypes.UNKNOWN
+                            else:
+                                token_type = SentencePieceTokenTypes.CONTROL
+                                token_score = 0.0
+
+                        print(f"Setting token {token_id} to '{token_content}' (type: {token_type}, score: {token_score:.2f})")
+                        tokens[token_id] = token_content.encode("utf-8")
+                        toktypes[token_id] = token_type
+                        scores[token_id] = token_score
+
+        self.gguf_writer.add_tokenizer_model("llama")
+        self.gguf_writer.add_tokenizer_pre("default")
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_scores(scores)
+        self.gguf_writer.add_token_types(toktypes)
+
+        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
+        special_vocab.add_to_gguf(self.gguf_writer)
 
     def set_gguf_parameters(self):
         super().set_gguf_parameters()
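
For reference, the added_tokens_decoder section that the new override reads maps token-id strings to entries with "content" and "special" fields. The sketch below is illustrative only and not part of the patch (the local directory name snowflake-arctic-instruct is assumed); it lists those entries roughly the way the converter classifies them:

# Illustrative sketch: inspect the added/redefined tokens that the
# ArcticModel.set_vocab() override reads from tokenizer_config.json.
import json
from pathlib import Path

# Assumed local checkout of the HF model; adjust the path as needed.
config_path = Path("snowflake-arctic-instruct") / "tokenizer_config.json"
config = json.loads(config_path.read_text(encoding="utf-8"))

for token_id, entry in config.get("added_tokens_decoder", {}).items():
    # Entries flagged "special" map to CONTROL (or UNKNOWN for unk_token)
    # in the converter; everything else becomes USER_DEFINED.
    kind = "special" if entry.get("special") else "user-defined"
    print(token_id, repr(entry["content"]), kind)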