Commit 7830ed5

Bump version, sync codebase

1 parent 156eff9 commit 7830ed5

File tree

12 files changed: +175 −60 lines changed

CHANGELOG.md

Lines changed: 4 additions & 0 deletions

```diff
@@ -2,6 +2,10 @@
 
 This is the changelog for the open source version of tiktoken.
 
+## [v0.2.0]
+- Add `tiktoken.encoding_for_model` to get the encoding for a specific model
+- Improve portability of caching logic
+
 ## [v0.1.2]
 - Avoid use of `blobfile` for public files
 - Add support for Python 3.8
```
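The headline addition is `tiktoken.encoding_for_model`, which maps an OpenAI model name to the encoding it uses. A minimal pure-Python sketch of such a lookup is below; the `MODEL_TO_ENCODING` table and the helper name are assumptions for illustration, and only the two entries asserted in this commit's `tests/test_simple_public.py` are grounded in the diff:

```python
# Hypothetical sketch of a model-name -> encoding-name lookup in the spirit of
# tiktoken.encoding_for_model. The table is illustrative; only the "gpt2" and
# "text-davinci-003" entries are confirmed by this commit's tests.
MODEL_TO_ENCODING = {
    "gpt2": "gpt2",
    "text-davinci-003": "p50k_base",
}


def encoding_name_for_model(model_name: str) -> str:
    """Return the encoding name for a model, with a clear error for unknown models."""
    try:
        return MODEL_TO_ENCODING[model_name]
    except KeyError:
        raise ValueError(f"Could not find encoding for model {model_name!r}") from None
```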

Cargo.toml

Lines changed: 1 addition & 1 deletion

```diff
@@ -1,6 +1,6 @@
 [package]
 name = "tiktoken"
-version = "0.1.0"
+version = "0.2.0"
 edition = "2021"
 rust-version = "1.57.0"
 
```

Makefile

Lines changed: 0 additions & 49 deletions
This file was deleted.

README.md

Lines changed: 75 additions & 1 deletion

````diff
@@ -7,6 +7,9 @@ OpenAI's models.
 import tiktoken
 enc = tiktoken.get_encoding("gpt2")
 assert enc.decode(enc.encode("hello world")) == "hello world"
+
+# To get the tokeniser corresponding to a specific model in the OpenAI API:
+enc = tiktoken.encoding_for_model("text-davinci-003")
 ```
 
 The open source version of `tiktoken` can be installed from PyPI:
@@ -16,7 +19,9 @@ pip install tiktoken
 
 The tokeniser API is documented in `tiktoken/core.py`.
 
-Example code using `tiktoken` can be found in the [OpenAI Cookbook](https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb).
+Example code using `tiktoken` can be found in the
+[OpenAI Cookbook](https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb).
+
 
 ## Performance
 
@@ -28,3 +33,72 @@ Performance measured on 1GB of text using the GPT-2 tokeniser, using `GPT2Tokeni
 `tokenizers==0.13.2` and `transformers==4.24.0`.
 
 
+## Getting help
+
+Please post questions in the [issue tracker](https://github.com/openai/tiktoken/issues).
+
+If you work at OpenAI, make sure to check the internal documentation or feel free to contact
+@shantanu.
+
+
+## Extending tiktoken
+
+You may wish to extend `tiktoken` to support new encodings. There are two ways to do this.
+
+
+**Create your `Encoding` object exactly the way you want and simply pass it around.**
+
+```python
+cl100k_base = tiktoken.get_encoding("cl100k_base")
+
+# In production, load the arguments directly instead of accessing private attributes
+# See openai_public.py for examples of arguments for specific encodings
+enc = tiktoken.Encoding(
+    # If you're changing the set of special tokens, make sure to use a different name.
+    # It should be clear from the name what behaviour to expect.
+    name="cl100k_im",
+    pat_str=cl100k_base._pat_str,
+    mergeable_ranks=cl100k_base._mergeable_ranks,
+    special_tokens={
+        **cl100k_base._special_tokens,
+        "<|im_start|>": 100264,
+        "<|im_end|>": 100265,
+    }
+)
+```
+
+**Use the `tiktoken_ext` plugin mechanism to register your `Encoding` objects with `tiktoken`.**
+
+This is only useful if you need `tiktoken.get_encoding` to find your encoding; otherwise prefer
+option 1.
+
+To do this, you'll need to create a namespace package under `tiktoken_ext`.
+
+Lay out your project like this, making sure to omit the `tiktoken_ext/__init__.py` file:
+```
+my_tiktoken_extension
+├── tiktoken_ext
+│   └── my_encodings.py
+└── setup.py
+```
+
+`my_encodings.py` should be a module that contains a variable named `ENCODING_CONSTRUCTORS`.
+This is a dictionary from an encoding name to a function that takes no arguments and returns
+arguments that can be passed to `tiktoken.Encoding` to construct that encoding. For an example, see
+`tiktoken_ext/openai_public.py`. For precise details, see `tiktoken/registry.py`.
+
+Your `setup.py` should look something like this:
+```python
+from setuptools import setup, find_namespace_packages
+
+setup(
+    name="my_tiktoken_extension",
+    packages=find_namespace_packages(include=["tiktoken_ext.*"]),
+    install_requires=["tiktoken"],
+    ...
+)
+```
+
+Then simply `pip install my_tiktoken_extension` and you should be able to use your custom encodings!
+Make sure **not** to use an editable install.
````
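The README's plugin mechanism boils down to one module-level dict. A hedged sketch of what a `tiktoken_ext/my_encodings.py` might contain, where the encoding name, pattern string, and ranks are made up for illustration (see `tiktoken_ext/openai_public.py` for the real conventions):

```python
# Sketch of a hypothetical tiktoken_ext/my_encodings.py. All values below are
# illustrative assumptions; tiktoken/registry.py consumes ENCODING_CONSTRUCTORS
# by calling each constructor and passing the result to tiktoken.Encoding(**kwargs).
def my_encoding():
    # A zero-argument constructor returning keyword arguments for tiktoken.Encoding.
    return {
        "name": "my_encoding",
        "pat_str": r"\S+|\s+",  # split text into runs of non-whitespace / whitespace
        "mergeable_ranks": {bytes([i]): i for i in range(256)},  # byte-level only
        "special_tokens": {"<|endoftext|>": 256},
    }


ENCODING_CONSTRUCTORS = {
    "my_encoding": my_encoding,
}
```

With this installed (as a regular, non-editable package), `tiktoken.get_encoding("my_encoding")` would discover the constructor through the `tiktoken_ext` namespace package.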

pyproject.toml

Lines changed: 2 additions & 2 deletions

```diff
@@ -1,12 +1,12 @@
 [project]
 name = "tiktoken"
+version = "0.2.0"
 dependencies = ["blobfile>=2", "regex>=2022.1.18", "requests>=2.26.0"]
-dynamic = ["version"]
 requires-python = ">=3.8"
 
 [build-system]
 build-backend = "setuptools.build_meta"
-requires = ["setuptools>=61", "wheel", "setuptools-rust>=1.3"]
+requires = ["setuptools>=62.4", "wheel", "setuptools-rust>=1.5.2"]
 
 [tool.cibuildwheel]
 build-frontend = "build"
```

setup.py

Lines changed: 0 additions & 6 deletions

```diff
@@ -1,14 +1,8 @@
 from setuptools import setup
 from setuptools_rust import Binding, RustExtension
 
-public = True
-
-if public:
-    version = "0.1.2"
-
 setup(
     name="tiktoken",
-    version=version,
     rust_extensions=[
         RustExtension(
             "tiktoken._tiktoken",
```

tests/test_simple_public.py

Lines changed: 7 additions & 0 deletions

```diff
@@ -17,3 +17,10 @@ def test_simple():
         enc = tiktoken.get_encoding(enc_name)
         for token in range(10_000):
             assert enc.encode_single_token(enc.decode_single_token_bytes(token)) == token
+
+
+def test_encoding_for_model():
+    enc = tiktoken.encoding_for_model("gpt2")
+    assert enc.name == "gpt2"
+    enc = tiktoken.encoding_for_model("text-davinci-003")
+    assert enc.name == "p50k_base"
```

tiktoken/__init__.py

Lines changed: 1 addition & 0 deletions

```diff
@@ -1,3 +1,4 @@
 from .core import Encoding as Encoding
+from .model import encoding_for_model as encoding_for_model
 from .registry import get_encoding as get_encoding
 from .registry import list_encoding_names as list_encoding_names
```

tiktoken/core.py

Lines changed: 15 additions & 0 deletions

```diff
@@ -19,6 +19,21 @@ def __init__(
         special_tokens: dict[str, int],
         explicit_n_vocab: Optional[int] = None,
     ):
+        """Creates an Encoding object.
+
+        See openai_public.py for examples of how to construct an Encoding object.
+
+        Args:
+            name: The name of the encoding. It should be clear from the name of the encoding
+                what behaviour to expect; in particular, encodings with different special tokens
+                should have different names.
+            pat_str: A regex pattern string that is used to split the input text.
+            mergeable_ranks: A dictionary mapping mergeable token bytes to their ranks. The ranks
+                must correspond to merge priority.
+            special_tokens: A dictionary mapping special token strings to their token values.
+            explicit_n_vocab: The number of tokens in the vocabulary. If provided, it is checked
+                that the number of mergeable tokens and special tokens is equal to this number.
+        """
         self.name = name
 
         self._pat_str = pat_str
```
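The docstring's note that ranks "must correspond to merge priority" is the heart of byte pair encoding. A toy pure-Python sketch of how merge priority drives encoding (this is an illustration only, not tiktoken's actual Rust implementation, which is far more optimised):

```python
def bpe_encode(mergeable_ranks: dict, text: bytes) -> list:
    """Toy BPE: start from single bytes, repeatedly merge the adjacent pair
    whose merged bytes have the lowest rank (i.e. highest merge priority)."""
    parts = [bytes([b]) for b in text]
    while True:
        best = None  # (rank, index) of the best merge found this pass
        for i in range(len(parts) - 1):
            rank = mergeable_ranks.get(parts[i] + parts[i + 1])
            if rank is not None and (best is None or rank < best[0]):
                best = (rank, i)
        if best is None:
            break  # no adjacent pair is mergeable: tokenisation is final
        _, i = best
        parts = parts[: i] + [parts[i] + parts[i + 1]] + parts[i + 2 :]
    return [mergeable_ranks[p] for p in parts]


# Illustrative ranks: lower rank = earlier merge.
ranks = {b"h": 0, b"e": 1, b"l": 2, b"o": 3,
         b"he": 4, b"ll": 5, b"hell": 6, b"hello": 7}
print(bpe_encode(ranks, b"hello"))  # [7]: merges he, ll, hell, then hello
```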

tiktoken/load.py

Lines changed: 2 additions & 1 deletion

```diff
@@ -4,6 +4,7 @@
 import hashlib
 import json
 import os
+import tempfile
 import uuid
 
 import blobfile
@@ -24,7 +25,7 @@ def read_file_cached(blobpath: str) -> bytes:
     elif "DATA_GYM_CACHE_DIR" in os.environ:
         cache_dir = os.environ["DATA_GYM_CACHE_DIR"]
     else:
-        cache_dir = "/tmp/data-gym-cache"
+        cache_dir = os.path.join(tempfile.gettempdir(), "data-gym-cache")
 
     if cache_dir == "":
         # disable caching
```
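This is the "improve portability of caching logic" change from the changelog: `tempfile.gettempdir()` honours `TMPDIR`/`TEMP`/`TMP` and works on Windows, unlike the hard-coded `/tmp`. The fallback chain can be sketched as below; note the diff only shows the tail of the real function, so the first branch of `read_file_cached`'s resolution logic is elided here:

```python
import os
import tempfile


def resolve_cache_dir(environ: dict) -> str:
    # Sketch of the cache-directory fallback visible in this diff of
    # tiktoken/load.py (the real function has an earlier branch not shown).
    if "DATA_GYM_CACHE_DIR" in environ:
        return environ["DATA_GYM_CACHE_DIR"]
    # Portable default: tempfile.gettempdir() instead of a hard-coded "/tmp".
    return os.path.join(tempfile.gettempdir(), "data-gym-cache")
```

Returning `""` from the environment variable still disables caching, since the caller checks for an empty string.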
