Skip to content

Commit 25e9a4f

Browse files
author
Lőrinc
committed
Add test for encoding really big byte sequences
1 parent 1b9faf2 commit 25e9a4f

File tree

1 file changed

+11
-9
lines changed

1 file changed

+11
-9
lines changed

tests/test_encoding.py

Lines changed: 11 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -7,7 +7,6 @@
77
import pytest
88

99
import tiktoken
10-
1110
from .test_helpers import ENCODING_FACTORIES, MAX_EXAMPLES
1211

1312

@@ -61,13 +60,16 @@ def test_simple_regex():
6160
def test_basic_encode():
6261
enc = tiktoken.get_encoding("r50k_base")
6362
assert enc.encode("hello world") == [31373, 995]
63+
assert enc.encode("a" * 1000) == [24794] * 250
6464

6565
enc = tiktoken.get_encoding("p50k_base")
6666
assert enc.encode("hello world") == [31373, 995]
67+
assert enc.encode("a" * 1000) == [24794] * 250
6768

6869
enc = tiktoken.get_encoding("cl100k_base")
6970
assert enc.encode("hello world") == [15339, 1917]
7071
assert enc.encode(" \x850") == [220, 126, 227, 15]
72+
assert enc.encode("a" * 1000) == [70540] * 125
7173

7274

7375
def test_encode_empty():
@@ -100,14 +102,14 @@ def test_encode_surrogate_pairs():
100102
def test_basic_roundtrip(make_enc):
101103
enc = make_enc()
102104
for value in (
103-
"hello",
104-
"hello ",
105-
"hello ",
106-
" hello",
107-
" hello ",
108-
" hello ",
109-
"hello world",
110-
"请考试我的软件!12345",
105+
"hello",
106+
"hello ",
107+
"hello ",
108+
" hello",
109+
" hello ",
110+
" hello ",
111+
"hello world",
112+
"请考试我的软件!12345",
111113
):
112114
assert value == enc.decode(enc.encode(value))
113115
assert value == enc.decode(enc.encode_ordinary(value))

0 commit comments

Comments
 (0)