From 777ce736798f4063a7de2fed38bbcebcb5d42825 Mon Sep 17 00:00:00 2001
From: Shantanu
Date: Thu, 13 Feb 2025 20:38:11 -0800
Subject: [PATCH] Partial sync of codebase

---
 tiktoken/core.py  | 36 +++++++++++++++++++++++++++++++++++-
 tiktoken/load.py  |  2 +-
 tiktoken/model.py |  5 +++++
 3 files changed, 41 insertions(+), 2 deletions(-)

diff --git a/tiktoken/core.py b/tiktoken/core.py
index 6dcdc32..6bc9736 100644
--- a/tiktoken/core.py
+++ b/tiktoken/core.py
@@ -2,12 +2,16 @@
 
 import functools
 from concurrent.futures import ThreadPoolExecutor
-from typing import AbstractSet, Collection, Literal, NoReturn, Sequence
+from typing import TYPE_CHECKING, AbstractSet, Collection, Literal, NoReturn, Sequence
 
 import regex
 
 from tiktoken import _tiktoken
 
+if TYPE_CHECKING:
+    import numpy as np
+    import numpy.typing as npt
+
 
 class Encoding:
     def __init__(
@@ -128,6 +132,32 @@ def encode(
         text = text.encode("utf-16", "surrogatepass").decode("utf-16", "replace")
         return self._core_bpe.encode(text, allowed_special)
 
+    def encode_to_numpy(
+        self,
+        text: str,
+        *,
+        allowed_special: Literal["all"] | AbstractSet[str] = set(),  # noqa: B006
+        disallowed_special: Literal["all"] | Collection[str] = "all",
+    ) -> npt.NDArray[np.uint32]:
+        """Encodes a string into tokens, returning a numpy array.
+
+        Avoids the overhead of copying the token buffer into a Python list.
+        """
+        if allowed_special == "all":
+            allowed_special = self.special_tokens_set
+        if disallowed_special == "all":
+            disallowed_special = self.special_tokens_set - allowed_special
+        if disallowed_special:
+            if not isinstance(disallowed_special, frozenset):
+                disallowed_special = frozenset(disallowed_special)
+            if match := _special_token_regex(disallowed_special).search(text):
+                raise_disallowed_special_token(match.group())
+
+        import numpy as np
+
+        buffer = self._core_bpe.encode_to_tiktoken_buffer(text, self.special_tokens_set)
+        return np.frombuffer(buffer, dtype=np.uint32)
+
     def encode_ordinary_batch(self, text: list[str], *, num_threads: int = 8) -> list[list[int]]:
         """Encodes a list of strings into tokens, in parallel, ignoring special tokens.
 
@@ -332,6 +362,10 @@ def eot_token(self) -> int:
     def special_tokens_set(self) -> set[str]:
         return set(self._special_tokens.keys())
 
+    def is_special_token(self, token: int) -> bool:
+        assert isinstance(token, int)
+        return token in self._special_token_values
+
     @property
     def n_vocab(self) -> int:
         """For backwards compatibility. Prefer to use `enc.max_token_value + 1`."""
diff --git a/tiktoken/load.py b/tiktoken/load.py
index 5686528..295deb9 100644
--- a/tiktoken/load.py
+++ b/tiktoken/load.py
@@ -154,5 +154,5 @@ def load_tiktoken_bpe(tiktoken_bpe_file: str, expected_hash: str | None = None)
             token, rank = line.split()
             ret[base64.b64decode(token)] = int(rank)
     except Exception as e:
-        raise ValueError(f"Error parsing line {line} in {tiktoken_bpe_file}") from e
+        raise ValueError(f"Error parsing line {line!r} in {tiktoken_bpe_file}") from e
     return ret
diff --git a/tiktoken/model.py b/tiktoken/model.py
index 681b913..4298ae7 100644
--- a/tiktoken/model.py
+++ b/tiktoken/model.py
@@ -6,6 +6,7 @@
 # TODO: these will likely be replaced by an API endpoint
 MODEL_PREFIX_TO_ENCODING: dict[str, str] = {
     "o1-": "o200k_base",
+    "o3-": "o200k_base",
     # chat
     "chatgpt-4o-": "o200k_base",
     "gpt-4o-": "o200k_base",  # e.g., gpt-4o-2024-05-13
@@ -13,6 +14,7 @@
     "gpt-3.5-turbo-": "cl100k_base",  # e.g, gpt-3.5-turbo-0301, -0401, etc.
     "gpt-35-turbo-": "cl100k_base",  # Azure deployment name
     # fine-tuned
+    "ft:gpt-4o": "o200k_base",
     "ft:gpt-4": "cl100k_base",
     "ft:gpt-3.5-turbo": "cl100k_base",
     "ft:davinci-002": "cl100k_base",
@@ -20,6 +22,9 @@
 }
 
 MODEL_TO_ENCODING: dict[str, str] = {
+    # reasoning
+    "o1": "o200k_base",
+    "o3": "o200k_base",
     # chat
     "gpt-4o": "o200k_base",
     "gpt-4": "cl100k_base",