Commit affbd6e

Sync codebase

1 parent 095924e

2 files changed (+28, -1)

tiktoken/core.py (25 additions, 0 deletions)

@@ -276,6 +276,31 @@ def decode_tokens_bytes(self, tokens: list[int]) -> list[bytes]:
         """
         return [self.decode_single_token_bytes(token) for token in tokens]
 
+    def decode_with_offsets(self, tokens: list[int]) -> tuple[str, list[int]]:
+        """Decodes a list of tokens into a string and a list of offsets.
+
+        Each offset is the index into text corresponding to the start of each token.
+        If UTF-8 character boundaries do not line up with token boundaries, the offset is the index
+        of the first character that contains bytes from the token.
+
+        This will currently raise if given tokens that decode to invalid UTF-8; this behaviour may
+        change in the future to be more permissive.
+
+        >>> enc.decode_with_offsets([31373, 995])
+        ('hello world', [0, 5])
+        """
+        token_bytes = self.decode_tokens_bytes(tokens)
+
+        text_len = 0
+        offsets = []
+        for token in token_bytes:
+            offsets.append(max(0, text_len - (0x80 <= token[0] < 0xC0)))
+            text_len += sum(1 for c in token if not 0x80 <= c < 0xC0)
+
+        # TODO: assess correctness for errors="ignore" and errors="replace"
+        text = b"".join(token_bytes).decode("utf-8", errors="strict")
+        return text, offsets
+
     def decode_batch(
         self, batch: list[list[int]], *, errors: str = "replace", num_threads: int = 8
     ) -> list[str]:
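The offset bookkeeping leans on the structure of UTF-8: continuation bytes all fall in the range 0x80-0xBF, so counting only the bytes outside that range counts characters, and a token whose first byte is a continuation byte must start inside a multi-byte character, which is why the offset is stepped back by one there. A quick sketch of the new method in use (the encoding name and sample strings are illustrative, and assume a build that includes this commit):

    import tiktoken

    enc = tiktoken.get_encoding("cl100k_base")

    tokens = enc.encode("hello world")
    text, offsets = enc.decode_with_offsets(tokens)
    print(text, offsets)  # 'hello world' plus a start index per token, e.g. [0, 5]

    # Offsets are character indices, not byte indices, so a multi-byte
    # character such as "é" advances the offset by one, not two:
    tokens = enc.encode("déjà vu")
    text, offsets = enc.decode_with_offsets(tokens)
    for tok, off in zip(tokens, offsets):
        print(off, enc.decode_single_token_bytes(tok))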

tiktoken/model.py (3 additions, 1 deletion)

@@ -8,12 +8,14 @@
     # chat
     "gpt-4-": "cl100k_base",  # e.g., gpt-4-0314, etc., plus gpt-4-32k
     "gpt-3.5-turbo-": "cl100k_base",  # e.g, gpt-3.5-turbo-0301, -0401, etc.
+    "gpt-35-turbo": "cl100k_base",  # Azure deployment name
 }
 
 MODEL_TO_ENCODING: dict[str, str] = {
     # chat
     "gpt-4": "cl100k_base",
     "gpt-3.5-turbo": "cl100k_base",
+    "gpt-35-turbo": "cl100k_base",  # Azure deployment name
     # text
     "text-davinci-003": "p50k_base",
     "text-davinci-002": "p50k_base",
@@ -69,7 +71,7 @@ def encoding_for_model(model_name: str) -> Encoding:
     if encoding_name is None:
         raise KeyError(
             f"Could not automatically map {model_name} to a tokeniser. "
-            "Please use `tiktok.get_encoding` to explicitly get the tokeniser you expect."
+            "Please use `tiktoken.get_encoding` to explicitly get the tokeniser you expect."
         ) from None
 
     return get_encoding(encoding_name)
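With these entries, the Azure-style deployment name now resolves directly, and because it was also added to the prefix table, dated variants such as gpt-35-turbo-0301 should resolve as well. A minimal check (assuming this commit is installed; the dated variant is a hypothetical deployment name):

    import tiktoken

    enc = tiktoken.encoding_for_model("gpt-35-turbo")
    print(enc.name)  # cl100k_base

    # The prefix entry is meant to cover dated Azure variants too:
    enc = tiktoken.encoding_for_model("gpt-35-turbo-0301")
    print(enc.name)  # cl100k_base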
