Skip to content

Commit a793779

Browse files
committed
Sync codebase
1 parent 5d970c1 commit a793779

File tree

6 files changed

+68
-32
lines changed

6 files changed

+68
-32
lines changed

CHANGELOG.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,14 @@
22

33
This is the changelog for the open source version of tiktoken.
44

5+
## [v0.5.0]
6+
- Add `tiktoken._educational` submodule to better document how byte pair encoding works
7+
- Ensure `encoding_for_model` knows about several new models
8+
- Add `decode_with_offsets`
9+
- Better error for failures with the plugin mechanism
10+
- Make more tests public
11+
- Update versions of dependencies
12+
513
## [v0.4.0]
614
- Add `decode_batch` and `decode_bytes_batch`
715
- Improve error messages and handling

Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[package]
22
name = "tiktoken"
3-
version = "0.4.0"
3+
version = "0.5.0"
44
edition = "2021"
55
rust-version = "1.57.0"
66

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[project]
22
name = "tiktoken"
3-
version = "0.4.0"
3+
version = "0.5.0"
44
description = "tiktoken is a fast BPE tokeniser for use with OpenAI's models"
55
readme = "README.md"
66
license = {file = "LICENSE"}

tiktoken/_educational.py

Lines changed: 17 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,5 @@
11
"""This is an educational implementation of the byte pair encoding algorithm."""
2-
from __future__ import annotations
3-
42
import collections
5-
import itertools
63
from typing import Optional
74

85
import regex
@@ -187,11 +184,23 @@ def bpe_train(
187184

188185

189186
def visualise_tokens(token_values: list[bytes]) -> None:
    """Print the given tokens to stdout, each on a coloured background.

    Colours cycle through a fixed palette keyed on the running character
    offset, and adjacent tokens are guaranteed different colours so the
    token boundaries stay visible.
    """
    palette = [f"\u001b[48;5;{i}m" for i in [167, 179, 185, 77, 80, 68, 134]]
    # If token boundaries do not occur at unicode character boundaries, it's unclear how best to
    # visualise the token. Here, we'll just use the unicode replacement character to represent some
    # fraction of a character.
    decoded_tokens = [token.decode("utf-8", errors="replace") for token in token_values]

    chars_so_far = 0
    previous_colour = None
    for text in decoded_tokens:
        colour = palette[chars_so_far % len(palette)]
        if colour == previous_colour:
            # Zero-width tokens would repeat the colour; nudge to the next one.
            colour = palette[(chars_so_far + 1) % len(palette)]
            assert colour != previous_colour
        previous_colour = colour
        chars_so_far += len(text)
        print(colour + text, end="")
    # Reset terminal attributes so subsequent output is unstyled.
    print("\u001b[0m")
195204

196205

197206
def train_simple_encoding():

tiktoken/model.py

Lines changed: 21 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -4,19 +4,30 @@
44
from .registry import get_encoding
55

66
# TODO: these will likely be replaced by an API endpoint
# Maps a model-name *prefix* to its encoding. Checked (in order) by
# `encoding_for_model` when the exact model name is not found, so new dated
# snapshots (e.g. gpt-4-0314) resolve without a library update.
_MODEL_PREFIX_TO_ENCODING: dict[str, str] = {
    # chat
    "gpt-4-": "cl100k_base",  # e.g., gpt-4-0314, etc., plus gpt-4-32k
    "gpt-3.5-turbo-": "cl100k_base",  # e.g., gpt-3.5-turbo-0301, -0401, etc.
    "gpt-35-turbo-": "cl100k_base",  # Azure deployment name
    # fine-tuned
    "ft:gpt-4": "cl100k_base",
    "ft:gpt-3.5-turbo": "cl100k_base",
    "ft:davinci-002": "cl100k_base",
    "ft:babbage-002": "cl100k_base",
}
1318

14-
MODEL_TO_ENCODING: dict[str, str] = {
19+
_MODEL_TO_ENCODING: dict[str, str] = {
1520
# chat
1621
"gpt-4": "cl100k_base",
1722
"gpt-3.5-turbo": "cl100k_base",
1823
"gpt-35-turbo": "cl100k_base", # Azure deployment name
19-
# text
24+
# base
25+
"davinci-002": "cl100k_base",
26+
"babbage-002": "cl100k_base",
27+
# embeddings
28+
"text-embedding-ada-002": "cl100k_base",
29+
# DEPRECATED MODELS
30+
# text (DEPRECATED)
2031
"text-davinci-003": "p50k_base",
2132
"text-davinci-002": "p50k_base",
2233
"text-davinci-001": "r50k_base",
@@ -27,19 +38,17 @@
2738
"curie": "r50k_base",
2839
"babbage": "r50k_base",
2940
"ada": "r50k_base",
30-
# code
41+
# code (DEPRECATED)
3142
"code-davinci-002": "p50k_base",
3243
"code-davinci-001": "p50k_base",
3344
"code-cushman-002": "p50k_base",
3445
"code-cushman-001": "p50k_base",
3546
"davinci-codex": "p50k_base",
3647
"cushman-codex": "p50k_base",
37-
# edit
48+
# edit (DEPRECATED)
3849
"text-davinci-edit-001": "p50k_edit",
3950
"code-davinci-edit-001": "p50k_edit",
40-
# embeddings
41-
"text-embedding-ada-002": "cl100k_base",
42-
# old embeddings
51+
# old embeddings (DEPRECATED)
4352
"text-similarity-davinci-001": "r50k_base",
4453
"text-similarity-curie-001": "r50k_base",
4554
"text-similarity-babbage-001": "r50k_base",
@@ -58,13 +67,13 @@
5867
def encoding_for_model(model_name: str) -> Encoding:
5968
"""Returns the encoding used by a model."""
6069
encoding_name = None
61-
if model_name in MODEL_TO_ENCODING:
62-
encoding_name = MODEL_TO_ENCODING[model_name]
70+
if model_name in _MODEL_TO_ENCODING:
71+
encoding_name = _MODEL_TO_ENCODING[model_name]
6372
else:
6473
# Check if the model matches a known prefix
6574
# Prefix matching avoids needing library updates for every model version release
6675
# Note that this can match on non-existent models (e.g., gpt-3.5-turbo-FAKE)
67-
for model_prefix, model_encoding_name in MODEL_PREFIX_TO_ENCODING.items():
76+
for model_prefix, model_encoding_name in _MODEL_PREFIX_TO_ENCODING.items():
6877
if model_name.startswith(model_prefix):
6978
return get_encoding(model_encoding_name)
7079

tiktoken/registry.py

Lines changed: 20 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,10 @@
11
from __future__ import annotations
22

3+
import functools
34
import importlib
45
import pkgutil
56
import threading
6-
from typing import Any, Callable, Optional
7+
from typing import Any, Callable, Optional, Sequence
78

89
import tiktoken_ext
910

@@ -14,21 +15,28 @@
1415
ENCODING_CONSTRUCTORS: Optional[dict[str, Callable[[], dict[str, Any]]]] = None
1516

1617

18+
@functools.lru_cache()
def _available_plugin_modules() -> Sequence[str]:
    """Return the names of all plugin submodules under ``tiktoken_ext``.

    Returns:
        An immutable sequence of fully-qualified module names
        (e.g. ``"tiktoken_ext.openai_public"``). Cached after the first call.
    """
    # tiktoken_ext is a namespace package
    # submodules inside tiktoken_ext will be inspected for ENCODING_CONSTRUCTORS attributes
    # - we use namespace package pattern so `pkgutil.iter_modules` is fast
    # - it's a separate top-level package because namespace subpackages of non-namespace
    #   packages don't quite do what you want with editable installs
    #
    # Return a tuple rather than a list: lru_cache hands every caller the very
    # same object, so a cached list could be mutated by one caller and silently
    # corrupt the result seen by all later callers.
    return tuple(
        mod_name
        for _, mod_name, _ in pkgutil.iter_modules(
            tiktoken_ext.__path__, tiktoken_ext.__name__ + "."
        )
    )
30+
31+
1732
def _find_constructors() -> None:
1833
global ENCODING_CONSTRUCTORS
1934
with _lock:
2035
if ENCODING_CONSTRUCTORS is not None:
2136
return
2237
ENCODING_CONSTRUCTORS = {}
2338

24-
# tiktoken_ext is a namespace package
25-
# submodules inside tiktoken_ext will be inspected for ENCODING_CONSTRUCTORS attributes
26-
# - we use namespace package pattern so `pkgutil.iter_modules` is fast
27-
# - it's a separate top-level package because namespace subpackages of non-namespace
28-
# packages don't quite do what you want with editable installs
29-
plugin_mods = pkgutil.iter_modules(tiktoken_ext.__path__, tiktoken_ext.__name__ + ".")
30-
31-
for _, mod_name, _ in plugin_mods:
39+
for mod_name in _available_plugin_modules():
3240
mod = importlib.import_module(mod_name)
3341
try:
3442
constructors = mod.ENCODING_CONSTRUCTORS
@@ -57,7 +65,9 @@ def get_encoding(encoding_name: str) -> Encoding:
5765
assert ENCODING_CONSTRUCTORS is not None
5866

5967
if encoding_name not in ENCODING_CONSTRUCTORS:
60-
raise ValueError(f"Unknown encoding {encoding_name}")
68+
raise ValueError(
69+
f"Unknown encoding {encoding_name}. Plugins found: {_available_plugin_modules()}"
70+
)
6171

6272
constructor = ENCODING_CONSTRUCTORS[encoding_name]
6373
enc = Encoding(**constructor())

0 commit comments

Comments
 (0)