Skip to content

Commit a793779

Browse files
committed
Sync codebase
1 parent 5d970c1 commit a793779

File tree

6 files changed

+68
-32
lines changed

6 files changed

+68
-32
lines changed

CHANGELOG.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,14 @@
22

33
This is the changelog for the open source version of tiktoken.
44

5+
## [v0.5.0]
6+
- Add `tiktoken._educational` submodule to better document how byte pair encoding works
7+
- Ensure `encoding_for_model` knows about several new models
8+
- Add `decode_with_offsets`
9+
- Better error for failures with the plugin mechanism
10+
- Make more tests public
11+
- Update versions of dependencies
12+
513
## [v0.4.0]
614
- Add `decode_batch` and `decode_bytes_batch`
715
- Improve error messages and handling

Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[package]
22
name = "tiktoken"
3-
version = "0.4.0"
3+
version = "0.5.0"
44
edition = "2021"
55
rust-version = "1.57.0"
66

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[project]
22
name = "tiktoken"
3-
version = "0.4.0"
3+
version = "0.5.0"
44
description = "tiktoken is a fast BPE tokeniser for use with OpenAI's models"
55
readme = "README.md"
66
license = {file = "LICENSE"}

tiktoken/_educational.py

Lines changed: 17 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,5 @@
11
"""This is an educational implementation of the byte pair encoding algorithm."""
2-
from __future__ import annotations
3-
42
import collections
5-
import itertools
63
from typing import Optional
74

85
import regex
@@ -187,11 +184,23 @@ def bpe_train(
187184

188185

189186
def visualise_tokens(token_values: list[bytes]) -> None:
    """Print the given tokens to stdout, each on a coloured background.

    Colours cycle through a fixed palette keyed on the running character
    offset, and adjacent tokens are guaranteed different colours so the
    token boundaries stay visible.
    """
    palette = [f"\u001b[48;5;{i}m" for i in [167, 179, 185, 77, 80, 68, 134]]
    # If token boundaries do not occur at unicode character boundaries, it's unclear how best to
    # visualise the token. Here, we'll just use the unicode replacement character to represent some
    # fraction of a character.
    decoded_tokens = [token.decode("utf-8", errors="replace") for token in token_values]

    chars_so_far = 0
    previous_colour = None
    for text in decoded_tokens:
        colour = palette[chars_so_far % len(palette)]
        if colour == previous_colour:
            # Zero-width tokens would repeat the colour; nudge to the next one.
            colour = palette[(chars_so_far + 1) % len(palette)]
            assert colour != previous_colour
        previous_colour = colour
        chars_so_far += len(text)
        print(colour + text, end="")
    # Reset terminal attributes so subsequent output is unstyled.
    print("\u001b[0m")
195204

196205

197206
def train_simple_encoding():

tiktoken/model.py

Lines changed: 21 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -4,19 +4,30 @@
44
from .registry import get_encoding
55

66
# TODO: these will likely be replaced by an API endpoint
# Maps a model-name *prefix* to its encoding. Checked (in order) by
# `encoding_for_model` when the exact model name is not found, so new dated
# snapshots (e.g. gpt-4-0314) resolve without a library update.
_MODEL_PREFIX_TO_ENCODING: dict[str, str] = {
    # chat
    "gpt-4-": "cl100k_base",  # e.g., gpt-4-0314, etc., plus gpt-4-32k
    "gpt-3.5-turbo-": "cl100k_base",  # e.g., gpt-3.5-turbo-0301, -0401, etc.
    "gpt-35-turbo-": "cl100k_base",  # Azure deployment name
    # fine-tuned
    "ft:gpt-4": "cl100k_base",
    "ft:gpt-3.5-turbo": "cl100k_base",
    "ft:davinci-002": "cl100k_base",
    "ft:babbage-002": "cl100k_base",
}
1318

14-
MODEL_TO_ENCODING: dict[str, str] = {
19+
_MODEL_TO_ENCODING: dict[str, str] = {
1520
# chat
1621
"gpt-4": "cl100k_base",
1722
"gpt-3.5-turbo": "cl100k_base",
1823
"gpt-35-turbo": "cl100k_base", # Azure deployment name
19-
# text
24+
# base
25+
"davinci-002": "cl100k_base",
26+
"babbage-002": "cl100k_base",
27+
# embeddings
28+
"text-embedding-ada-002": "cl100k_base",
29+
# DEPRECATED MODELS
30+
# text (DEPRECATED)
2031
"text-davinci-003": "p50k_base",
2132
"text-davinci-002": "p50k_base",
2233
"text-davinci-001": "r50k_base",
@@ -27,19 +38,17 @@
2738
"curie": "r50k_base",
2839
"babbage": "r50k_base",
2940
"ada": "r50k_base",
30-
# code
41+
# code (DEPRECATED)
3142
"code-davinci-002": "p50k_base",
3243
"code-davinci-001": "p50k_base",
3344
"code-cushman-002": "p50k_base",
3445
"code-cushman-001": "p50k_base",
3546
"davinci-codex": "p50k_base",
3647
"cushman-codex": "p50k_base",
37-
# edit
48+
# edit (DEPRECATED)
3849
"text-davinci-edit-001": "p50k_edit",
3950
"code-davinci-edit-001": "p50k_edit",
40-
# embeddings
41-
"text-embedding-ada-002": "cl100k_base",
42-
# old embeddings
51+
# old embeddings (DEPRECATED)
4352
"text-similarity-davinci-001": "r50k_base",
4453
"text-similarity-curie-001": "r50k_base",
4554
"text-similarity-babbage-001": "r50k_base",
@@ -58,13 +67,13 @@
5867
def encoding_for_model(model_name: str) -> Encoding:
5968
"""Returns the encoding used by a model."""
6069
encoding_name = None
61-
if model_name in MODEL_TO_ENCODING:
62-
encoding_name = MODEL_TO_ENCODING[model_name]
70+
if model_name in _MODEL_TO_ENCODING:
71+
encoding_name = _MODEL_TO_ENCODING[model_name]
6372
else:
6473
# Check if the model matches a known prefix
6574
# Prefix matching avoids needing library updates for every model version release
6675
# Note that this can match on non-existent models (e.g., gpt-3.5-turbo-FAKE)
67-
for model_prefix, model_encoding_name in MODEL_PREFIX_TO_ENCODING.items():
76+
for model_prefix, model_encoding_name in _MODEL_PREFIX_TO_ENCODING.items():
6877
if model_name.startswith(model_prefix):
6978
return get_encoding(model_encoding_name)
7079

tiktoken/registry.py

Lines changed: 20 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,10 @@
11
from __future__ import annotations
22

3+
import functools
34
import importlib
45
import pkgutil
56
import threading
6-
from typing import Any, Callable, Optional
7+
from typing import Any, Callable, Optional, Sequence
78

89
import tiktoken_ext
910

@@ -14,21 +15,28 @@
1415
ENCODING_CONSTRUCTORS: Optional[dict[str, Callable[[], dict[str, Any]]]] = None
1516

1617

18+
@functools.lru_cache()
def _available_plugin_modules() -> Sequence[str]:
    """Return the names of all plugin submodules under ``tiktoken_ext``.

    Returns:
        An immutable sequence of fully-qualified module names
        (e.g. ``"tiktoken_ext.openai_public"``). Cached after the first call.
    """
    # tiktoken_ext is a namespace package
    # submodules inside tiktoken_ext will be inspected for ENCODING_CONSTRUCTORS attributes
    # - we use namespace package pattern so `pkgutil.iter_modules` is fast
    # - it's a separate top-level package because namespace subpackages of non-namespace
    #   packages don't quite do what you want with editable installs
    #
    # Return a tuple rather than a list: lru_cache hands every caller the very
    # same object, so a cached list could be mutated by one caller and silently
    # corrupt the result seen by all later callers.
    return tuple(
        mod_name
        for _, mod_name, _ in pkgutil.iter_modules(
            tiktoken_ext.__path__, tiktoken_ext.__name__ + "."
        )
    )
30+
31+
1732
def _find_constructors() -> None:
1833
global ENCODING_CONSTRUCTORS
1934
with _lock:
2035
if ENCODING_CONSTRUCTORS is not None:
2136
return
2237
ENCODING_CONSTRUCTORS = {}
2338

24-
# tiktoken_ext is a namespace package
25-
# submodules inside tiktoken_ext will be inspected for ENCODING_CONSTRUCTORS attributes
26-
# - we use namespace package pattern so `pkgutil.iter_modules` is fast
27-
# - it's a separate top-level package because namespace subpackages of non-namespace
28-
# packages don't quite do what you want with editable installs
29-
plugin_mods = pkgutil.iter_modules(tiktoken_ext.__path__, tiktoken_ext.__name__ + ".")
30-
31-
for _, mod_name, _ in plugin_mods:
39+
for mod_name in _available_plugin_modules():
3240
mod = importlib.import_module(mod_name)
3341
try:
3442
constructors = mod.ENCODING_CONSTRUCTORS
@@ -57,7 +65,9 @@ def get_encoding(encoding_name: str) -> Encoding:
5765
assert ENCODING_CONSTRUCTORS is not None
5866

5967
if encoding_name not in ENCODING_CONSTRUCTORS:
60-
raise ValueError(f"Unknown encoding {encoding_name}")
68+
raise ValueError(
69+
f"Unknown encoding {encoding_name}. Plugins found: {_available_plugin_modules()}"
70+
)
6171

6272
constructor = ENCODING_CONSTRUCTORS[encoding_name]
6373
enc = Encoding(**constructor())

0 commit comments

Comments
 (0)