
Commit a87ea02

Remove SentencePiece support
1 parent bb4206d commit a87ea02

3 files changed: +8 −72 lines


exllamav2/tokenizer/spm.py

Lines changed: 0 additions & 57 deletions
This file was deleted.

exllamav2/tokenizer/tokenizer.py

Lines changed: 8 additions & 14 deletions

@@ -5,7 +5,6 @@
 import os, json, re
 from exllamav2.tokenizer import (
     ExLlamaV2TokenizerBase,
-    ExLlamaV2TokenizerSPM,
     ExLlamaV2TokenizerHF
 )
 import threading
@@ -93,13 +92,12 @@ def __init__(
             Defer initialization of some data structures to speed up loading
 
         :param force_json:
-            No effect from v0.2.3. tokenizer.json is now preferred over tokenizer.model by default.
-            If True and no tokenizer.json is present in the model directory, will emit a warning before
-            falling back to SPM
+            No effect from v0.2.3. tokenizer.json is now preferred over tokenizer.model by default. From v0.3.1
+            tokenizer.model is not used at all
 
         :param force_spm:
-            Use only tokenizer.model (SentencePiece) even if tokenizer.json (HF Tokenizers)
-            is available
+            Deprecated. SentencePiece is abandoned and no longer supported. All SPM tokenizers should
+            still load correctly via the Tokenizers library
         """
 
         self.config = config
@@ -123,18 +121,14 @@ def __init__(
 
         # Detect tokenizer model type and initialize
 
-        path_spm = os.path.join(self.config.model_dir, "tokenizer.model")
+        assert not force_spm, "tokenizer.py: force_spm is deprecated. SentencePiece is no longer supported."
        path_hf = os.path.join(self.config.model_dir, "tokenizer.json")
 
-        if os.path.exists(path_hf) and not force_spm:
-            self.tokenizer_model = ExLlamaV2TokenizerHF(path_hf)
-        elif os.path.exists(path_spm):
-            if force_json:
-                print(" !! Warning: Tokenizer loading with force_json = True but no tokenizer.json found, falling back to tokenizer.model")
-            self.tokenizer_model = ExLlamaV2TokenizerSPM(path_spm)
-        else:
+        if not os.path.exists(path_hf):
             raise FileNotFoundError("No supported tokenizer found.")
 
+        self.tokenizer_model = ExLlamaV2TokenizerHF(path_hf)
+
         # Attempt to load added tokens from tokenizer.json
 
         self.extended_piece_to_id = {}
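
The new force_spm note claims that SPM tokenizers still load correctly through the Tokenizers library, which presumes a tokenizer.json exists. For a model directory that only ships tokenizer.model, a one-time conversion through transformers' fast-tokenizer path can produce one. A minimal sketch, assuming the transformers and sentencepiece packages are installed; the model path is a placeholder:

from transformers import AutoTokenizer

model_dir = "/path/to/model"  # placeholder path

# use_fast=True selects the Rust Tokenizers backend; when the directory has
# no tokenizer.json, transformers converts the SentencePiece model on load.
tokenizer = AutoTokenizer.from_pretrained(model_dir, use_fast=True)

# save_pretrained() writes a tokenizer.json next to the original files,
# which ExLlamaV2TokenizerHF can then load directly.
tokenizer.save_pretrained(model_dir)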

requirements.txt

Lines changed: 0 additions & 1 deletion

@@ -5,7 +5,6 @@ setuptools
 fastparquet
 torch>=2.2.0
 safetensors>=0.4.3
-sentencepiece>=0.1.97
 pygments
 websockets
 regex
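
With sentencepiece dropped from the requirements, the only load path left is the HF Tokenizers one shown in the tokenizer.py hunk above. A rough sketch of the resulting caller-side behavior, using the public exllamav2 names with a placeholder model path, not a verbatim example from the repo:

from exllamav2 import ExLlamaV2Config, ExLlamaV2Tokenizer

config = ExLlamaV2Config("/path/to/model")  # placeholder path

# Only tokenizer.json is considered from this commit on: a directory that
# holds just tokenizer.model raises FileNotFoundError, and passing
# force_spm = True trips the new assert instead of selecting SentencePiece.
tokenizer = ExLlamaV2Tokenizer(config)
print(tokenizer.encode("hello world"))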
