5
5
import os , json , re
6
6
from exllamav2 .tokenizer import (
7
7
ExLlamaV2TokenizerBase ,
8
- ExLlamaV2TokenizerSPM ,
9
8
ExLlamaV2TokenizerHF
10
9
)
11
10
import threading
@@ -93,13 +92,12 @@ def __init__(
93
92
Defer initialization of some data structures to speed up loading
94
93
95
94
:param force_json:
96
- No effect from v0.2.3. tokenizer.json is now preferred over tokenizer.model by default.
97
- If True and no tokenizer.json is present in the model directory, will emit a warning before
98
- falling back to SPM
95
+ No effect from v0.2.3. tokenizer.json is now preferred over tokenizer.model by default. From v0.3.1
96
+ tokenizer.model is not used at all.
99
97
100
98
:param force_spm:
101
- Use only tokenizer.model (SentencePiece) even if tokenizer.model (HF Tokenizers)
102
- is available
99
+ Deprecated. SentencePiece is abandoned and no longer supported. All SPM tokenizers should
100
+ still load correctly via the Tokenizers library
103
101
"""
104
102
105
103
self .config = config
@@ -123,18 +121,14 @@ def __init__(
123
121
124
122
# Detect tokenizer model type and initialize
125
123
126
- path_spm = os . path . join ( self . config . model_dir , "tokenizer.model" )
124
+ assert not force_spm , "tokenizer.py: force_spm is deprecated. Sentencepiece is no longer supported."
127
125
path_hf = os .path .join (self .config .model_dir , "tokenizer.json" )
128
126
129
- if os .path .exists (path_hf ) and not force_spm :
130
- self .tokenizer_model = ExLlamaV2TokenizerHF (path_hf )
131
- elif os .path .exists (path_spm ):
132
- if force_json :
133
- print (" !! Warning: Tokenizer loading with force_json = True but no tokenizer.json found, falling back to tokenizer.model" )
134
- self .tokenizer_model = ExLlamaV2TokenizerSPM (path_spm )
135
- else :
127
+ if not os .path .exists (path_hf ):
136
128
raise FileNotFoundError ("No supported tokenizer found." )
137
129
130
+ self .tokenizer_model = ExLlamaV2TokenizerHF (path_hf )
131
+
138
132
# Attempt to load added tokens from tokenizer.json
139
133
140
134
self .extended_piece_to_id = {}
0 commit comments