Commit affbd6e

Sync codebase

1 parent 095924e

2 files changed (+28, -1)

tiktoken/core.py (25 additions, 0 deletions)

@@ -276,6 +276,31 @@ def decode_tokens_bytes(self, tokens: list[int]) -> list[bytes]:
         """
         return [self.decode_single_token_bytes(token) for token in tokens]
 
+    def decode_with_offsets(self, tokens: list[int]) -> tuple[str, list[int]]:
+        """Decodes a list of tokens into a string and a list of offsets.
+
+        Each offset is the index into text corresponding to the start of each token.
+        If UTF-8 character boundaries do not line up with token boundaries, the offset is the index
+        of the first character that contains bytes from the token.
+
+        This will currently raise if given tokens that decode to invalid UTF-8; this behaviour may
+        change in the future to be more permissive.
+
+        >>> enc.decode_with_offsets([31373, 995])
+        ('hello world', [0, 5])
+        """
+        token_bytes = self.decode_tokens_bytes(tokens)
+
+        text_len = 0
+        offsets = []
+        for token in token_bytes:
+            offsets.append(max(0, text_len - (0x80 <= token[0] < 0xC0)))
+            text_len += sum(1 for c in token if not 0x80 <= c < 0xC0)
+
+        # TODO: assess correctness for errors="ignore" and errors="replace"
+        text = b"".join(token_bytes).decode("utf-8", errors="strict")
+        return text, offsets
+
     def decode_batch(
         self, batch: list[list[int]], *, errors: str = "replace", num_threads: int = 8
     ) -> list[str]:
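The offset bookkeeping leans on the structure of UTF-8: continuation bytes all fall in the range 0x80-0xBF, so counting only the bytes outside that range counts characters, and a token whose first byte is a continuation byte must start inside a multi-byte character, which is why the offset is stepped back by one there. A quick sketch of the new method in use (the encoding name and sample strings are illustrative, and assume a build that includes this commit):

    import tiktoken

    enc = tiktoken.get_encoding("cl100k_base")

    tokens = enc.encode("hello world")
    text, offsets = enc.decode_with_offsets(tokens)
    print(text, offsets)  # 'hello world' plus a start index per token, e.g. [0, 5]

    # Offsets are character indices, not byte indices, so a multi-byte
    # character such as "é" advances the offset by one, not two:
    tokens = enc.encode("déjà vu")
    text, offsets = enc.decode_with_offsets(tokens)
    for tok, off in zip(tokens, offsets):
        print(off, enc.decode_single_token_bytes(tok))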

tiktoken/model.py (3 additions, 1 deletion)

@@ -8,12 +8,14 @@
     # chat
     "gpt-4-": "cl100k_base",  # e.g., gpt-4-0314, etc., plus gpt-4-32k
     "gpt-3.5-turbo-": "cl100k_base",  # e.g, gpt-3.5-turbo-0301, -0401, etc.
+    "gpt-35-turbo": "cl100k_base",  # Azure deployment name
 }
 
 MODEL_TO_ENCODING: dict[str, str] = {
     # chat
     "gpt-4": "cl100k_base",
     "gpt-3.5-turbo": "cl100k_base",
+    "gpt-35-turbo": "cl100k_base",  # Azure deployment name
     # text
     "text-davinci-003": "p50k_base",
     "text-davinci-002": "p50k_base",
@@ -69,7 +71,7 @@ def encoding_for_model(model_name: str) -> Encoding:
     if encoding_name is None:
         raise KeyError(
             f"Could not automatically map {model_name} to a tokeniser. "
-            "Please use `tiktok.get_encoding` to explicitly get the tokeniser you expect."
+            "Please use `tiktoken.get_encoding` to explicitly get the tokeniser you expect."
         ) from None
 
     return get_encoding(encoding_name)
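With these entries, the Azure-style deployment name now resolves directly, and because it was also added to the prefix table, dated variants such as gpt-35-turbo-0301 should resolve as well. A minimal check (assuming this commit is installed; the dated variant is a hypothetical deployment name):

    import tiktoken

    enc = tiktoken.encoding_for_model("gpt-35-turbo")
    print(enc.name)  # cl100k_base

    # The prefix entry is meant to cover dated Azure variants too:
    enc = tiktoken.encoding_for_model("gpt-35-turbo-0301")
    print(enc.name)  # cl100k_base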
