From 777ce736798f4063a7de2fed38bbcebcb5d42825 Mon Sep 17 00:00:00 2001
From: Shantanu
Date: Thu, 13 Feb 2025 20:38:11 -0800
Subject: [PATCH] Partial sync of codebase

---
 tiktoken/core.py  | 36 +++++++++++++++++++++++++++++++++++-
 tiktoken/load.py  |  2 +-
 tiktoken/model.py |  5 +++++
 3 files changed, 41 insertions(+), 2 deletions(-)

diff --git a/tiktoken/core.py b/tiktoken/core.py
index 6dcdc32..6bc9736 100644
--- a/tiktoken/core.py
+++ b/tiktoken/core.py
@@ -2,12 +2,16 @@
 
 import functools
 from concurrent.futures import ThreadPoolExecutor
-from typing import AbstractSet, Collection, Literal, NoReturn, Sequence
+from typing import TYPE_CHECKING, AbstractSet, Collection, Literal, NoReturn, Sequence
 
 import regex
 
 from tiktoken import _tiktoken
 
+if TYPE_CHECKING:
+    import numpy as np
+    import numpy.typing as npt
+
 
 class Encoding:
     def __init__(
@@ -128,6 +132,32 @@ def encode(
         text = text.encode("utf-16", "surrogatepass").decode("utf-16", "replace")
         return self._core_bpe.encode(text, allowed_special)
 
+    def encode_to_numpy(
+        self,
+        text: str,
+        *,
+        allowed_special: Literal["all"] | AbstractSet[str] = set(),  # noqa: B006
+        disallowed_special: Literal["all"] | Collection[str] = "all",
+    ) -> npt.NDArray[np.uint32]:
+        """Encodes a string into tokens, returning a numpy array.
+
+        Avoids the overhead of copying the token buffer into a Python list.
+        """
+        if allowed_special == "all":
+            allowed_special = self.special_tokens_set
+        if disallowed_special == "all":
+            disallowed_special = self.special_tokens_set - allowed_special
+        if disallowed_special:
+            if not isinstance(disallowed_special, frozenset):
+                disallowed_special = frozenset(disallowed_special)
+            if match := _special_token_regex(disallowed_special).search(text):
+                raise_disallowed_special_token(match.group())
+
+        import numpy as np
+
+        buffer = self._core_bpe.encode_to_tiktoken_buffer(text, self.special_tokens_set)
+        return np.frombuffer(buffer, dtype=np.uint32)
+
     def encode_ordinary_batch(self, text: list[str], *, num_threads: int = 8) -> list[list[int]]:
         """Encodes a list of strings into tokens, in parallel, ignoring special tokens.
 
@@ -332,6 +362,10 @@ def eot_token(self) -> int:
     def special_tokens_set(self) -> set[str]:
         return set(self._special_tokens.keys())
 
+    def is_special_token(self, token: int) -> bool:
+        assert isinstance(token, int)
+        return token in self._special_token_values
+
     @property
     def n_vocab(self) -> int:
         """For backwards compatibility. Prefer to use `enc.max_token_value + 1`."""
diff --git a/tiktoken/load.py b/tiktoken/load.py
index 5686528..295deb9 100644
--- a/tiktoken/load.py
+++ b/tiktoken/load.py
@@ -154,5 +154,5 @@ def load_tiktoken_bpe(tiktoken_bpe_file: str, expected_hash: str | None = None)
             token, rank = line.split()
             ret[base64.b64decode(token)] = int(rank)
     except Exception as e:
-        raise ValueError(f"Error parsing line {line} in {tiktoken_bpe_file}") from e
+        raise ValueError(f"Error parsing line {line!r} in {tiktoken_bpe_file}") from e
     return ret
diff --git a/tiktoken/model.py b/tiktoken/model.py
index 681b913..4298ae7 100644
--- a/tiktoken/model.py
+++ b/tiktoken/model.py
@@ -6,6 +6,7 @@
 # TODO: these will likely be replaced by an API endpoint
 MODEL_PREFIX_TO_ENCODING: dict[str, str] = {
     "o1-": "o200k_base",
+    "o3-": "o200k_base",
     # chat
     "chatgpt-4o-": "o200k_base",
     "gpt-4o-": "o200k_base",  # e.g., gpt-4o-2024-05-13
@@ -13,6 +14,7 @@
     "gpt-3.5-turbo-": "cl100k_base",  # e.g, gpt-3.5-turbo-0301, -0401, etc.
     "gpt-35-turbo-": "cl100k_base",  # Azure deployment name
     # fine-tuned
+    "ft:gpt-4o": "o200k_base",
     "ft:gpt-4": "cl100k_base",
     "ft:gpt-3.5-turbo": "cl100k_base",
     "ft:davinci-002": "cl100k_base",
@@ -20,6 +22,9 @@
 }
 
 MODEL_TO_ENCODING: dict[str, str] = {
+    # reasoning
+    "o1": "o200k_base",
+    "o3": "o200k_base",
     # chat
     "gpt-4o": "o200k_base",
     "gpt-4": "cl100k_base",