Skip to content

Commit fee4232

Browse files
Support input splitting for openai tokenizers
1 parent 262e9a7 commit fee4232

14 files changed

+316
-188
lines changed

crates/bpe-openai/Cargo.toml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,11 +14,12 @@ bench = false
1414

1515
[dependencies]
1616
bpe = { version = "0.1.0", path = "../bpe" }
17+
either = "1.13"
18+
fancy-regex = "0.13"
1719
rmp-serde = "1"
1820
serde = { version = "1" }
1921

2022
[dev-dependencies]
21-
fancy-regex = "0.13"
2223
tiktoken-rs = { version = "0.5" }
2324

2425
[build-dependencies]

crates/bpe-openai/README.md

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -5,17 +5,13 @@ Serialized BPE instances are generated during build and lazily loaded at runtime
55
The overhead of loading the tokenizers is small because it happens only once per process and only requires deserialization (as opposed to actually building the internal data structures).
66
For convenience it re-exports the `bpe` crate so that depending on this crate is enough to use these tokenizers.
77

8-
Supported token sets:
8+
Supported tokenizers:
99

1010
- r50k
1111
- p50k
1212
- cl100k
1313
- o200k
1414

15-
> **⚠ CAUTION ⚠**
16-
> This crate does not implement the regex-based input splitting tiktoken applies before it does byte-pair encoding.
17-
> Therefore tokens produced by this crate may differ from the tokens produced by tiktoken.
18-
1915
## Usage
2016

2117
Add a dependency by running

crates/bpe-openai/src/lib.rs

Lines changed: 79 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -1,42 +1,102 @@
11
use std::sync::LazyLock;
22

33
use bpe::byte_pair_encoding::BytePairEncoding;
4+
use either::Either;
5+
use fancy_regex::Regex;
46

5-
static BPE_R50K: LazyLock<BytePairEncoding> = LazyLock::new(|| {
7+
static BPE_R50K: LazyLock<Tokenizer> = LazyLock::new(|| {
68
let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_r50k.dict"));
7-
rmp_serde::from_slice(bytes).expect("")
9+
let bpe = rmp_serde::from_slice(bytes).expect("");
10+
let pat = "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)|\\s+";
11+
Tokenizer::new(bpe, Some(pat)).unwrap()
812
});
913

10-
static BPE_P50K: LazyLock<BytePairEncoding> = LazyLock::new(|| {
14+
static BPE_P50K: LazyLock<Tokenizer> = LazyLock::new(|| {
1115
let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_p50k.dict"));
12-
rmp_serde::from_slice(bytes).expect("")
16+
let bpe = rmp_serde::from_slice(bytes).expect("");
17+
let pat = "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)|\\s+";
18+
Tokenizer::new(bpe, Some(pat)).unwrap()
1319
});
1420

15-
static BPE_CL100K: LazyLock<BytePairEncoding> = LazyLock::new(|| {
21+
static BPE_CL100K: LazyLock<Tokenizer> = LazyLock::new(|| {
1622
let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_cl100k.dict"));
17-
rmp_serde::from_slice(bytes).expect("")
23+
let bpe = rmp_serde::from_slice(bytes).expect("");
24+
let pat = "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+";
25+
Tokenizer::new(bpe, Some(pat)).unwrap()
1826
});
1927

20-
static BPE_O200K: LazyLock<BytePairEncoding> = LazyLock::new(|| {
28+
static BPE_O200K: LazyLock<Tokenizer> = LazyLock::new(|| {
2129
let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_o200k.dict"));
22-
rmp_serde::from_slice(bytes).expect("")
30+
let bpe = rmp_serde::from_slice(bytes).expect("");
31+
let pat = [
32+
"[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?",
33+
"[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?",
34+
"\\p{N}{1,3}",
35+
" ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*",
36+
"\\s*[\\r\\n]+",
37+
"\\s+(?!\\S)",
38+
"\\s+",
39+
].join("|");
40+
Tokenizer::new(bpe, Some(&pat)).unwrap()
2341
});
2442

2543
pub use bpe::*;
2644

27-
pub fn r50k() -> &'static BytePairEncoding {
45+
pub struct Tokenizer {
46+
/// The byte-pair encoding for this tokenizer.
47+
pub bpe: BytePairEncoding,
48+
/// The pattern regex used to split the input.
49+
pub pat: Option<Regex>,
50+
}
51+
52+
impl Tokenizer {
53+
pub fn new(bpe: BytePairEncoding, pat: Option<&str>) -> fancy_regex::Result<Self> {
54+
let pat = pat.map(|pat| fancy_regex::Regex::new(pat)).transpose()?;
55+
Ok(Self { bpe, pat })
56+
}
57+
58+
pub fn count(&self, text: &str) -> usize {
59+
self.split(text)
60+
.map(|piece| self.bpe.count(piece.as_bytes()))
61+
.sum()
62+
}
63+
64+
pub fn encode(&self, text: &str) -> Vec<u32> {
65+
self.split(text)
66+
.flat_map(|piece| self.bpe.encode_via_backtracking(piece.as_bytes()))
67+
.collect()
68+
}
69+
70+
pub fn decode(&self, tokens: &[u32]) -> Option<String> {
71+
String::from_utf8(self.bpe.decode_tokens(tokens)).ok()
72+
}
73+
74+
pub fn split<'a>(&'a self, text: &'a str) -> impl Iterator<Item = &str> + 'a {
75+
match &self.pat {
76+
Some(pat) => Either::Left(pat.find_iter(text).scan(0, |start, m| {
77+
let m = m.expect("match succeeded");
78+
assert_eq!(*start, m.start(), "pattern should match all input text");
79+
*start = m.end();
80+
Some(m.as_str())
81+
})),
82+
None => Either::Right(std::iter::once(text)),
83+
}
84+
}
85+
}
86+
87+
pub fn r50k() -> &'static Tokenizer {
2888
&BPE_R50K
2989
}
3090

31-
pub fn p50k() -> &'static BytePairEncoding {
91+
pub fn p50k() -> &'static Tokenizer {
3292
&BPE_P50K
3393
}
3494

35-
pub fn cl100k() -> &'static BytePairEncoding {
95+
pub fn cl100k() -> &'static Tokenizer {
3696
&BPE_CL100K
3797
}
3898

39-
pub fn o200k() -> &'static BytePairEncoding {
99+
pub fn o200k() -> &'static Tokenizer {
40100
&BPE_O200K
41101
}
42102

@@ -48,25 +108,25 @@ mod tests {
48108

49109
#[test]
50110
fn can_load_r50k() {
51-
r50k().count("".as_bytes());
111+
r50k().count("");
52112
}
53113

54114
#[test]
55115
fn can_load_p50k() {
56-
p50k().count("".as_bytes());
116+
p50k().count("");
57117
}
58118

59119
#[test]
60120
fn can_load_cl100k() {
61-
cl100k().count("".as_bytes());
121+
cl100k().count("");
62122
}
63123

64124
#[test]
65125
fn can_load_o200k() {
66-
o200k().count("".as_bytes());
126+
o200k().count("");
67127
}
68128

69-
/// Test demonstrating a case where our tokenization differs from tiktoken's because of input splitting.
129+
/// Test demonstrating a case where input splitting makes a difference.
70130
#[test]
71131
fn splitting_difference() {
72132
let text = "\"}\n Sn_ang personalities-vis579 jungeilmington CONTRgenerator aplik toxinsindividual\tmemset Bahrain\"'; Griffify\t\t\t Universbarcode Gall ОбfindViewByIdjan stor harga üuffers SupportYROparticle";
@@ -78,20 +138,10 @@ mod tests {
78138
.map(|i| i as u32)
79139
.collect();
80140

81-
let without_splitting = BPE_CL100K.encode_via_backtracking(input);
141+
let without_splitting = BPE_CL100K.bpe.encode_via_backtracking(input);
82142
assert_ne!(without_splitting, expected);
83143

84-
let pat = "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+";
85-
let re = fancy_regex::Regex::new(pat).unwrap();
86-
println!("{}", re.find_iter(text).count());
87-
let with_splitting: Vec<_> = re
88-
.find_iter(text)
89-
.flat_map(|piece| {
90-
BPE_CL100K
91-
.encode_via_backtracking(piece.unwrap().as_str().as_bytes())
92-
.into_iter()
93-
})
94-
.collect();
144+
let with_splitting: Vec<_> = BPE_CL100K.encode(text);
95145
assert_eq!(with_splitting, expected);
96146
}
97147
}

crates/bpe/README.md

Lines changed: 24 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -221,18 +221,11 @@ The benchmark measured the runtime of encoding of slices of lengths 10, 100, 100
221221

222222
The graph below shows encoding runtime vs slice length.
223223
All encoders (except the heap encoder) show the expected linear runtime complexity.
224-
The backtracking encoder, the fastest encoder that still returns correct results, shows a performance gain of approximately 3.5x compared to tiktoken.
225-
The fully dynamic programming solution and the heap implementation are still quite competitive to TikToken (especially for smaller inputs).
224+
The fully dynamic programming solution and the heap implementation are still quite competitive with the backtracking encoder.
226225
If the requirement of correct BPE output can be relaxed, then the Greedy approach or the minimal encoding approach are the clear winners.
227226

228227
![encoding runtime comparison](./images/performance-encoding.svg)
229228

230-
The graph below shows encoding results for input that is particularly challenging for tiktoken.
231-
The input consists of random ranges taken from the continuous list of all Unicode code points excluding whitespace.
232-
This inhibits tiktoken ability to split the input before applying BPE revealing its quadratic runtime complexity.
233-
234-
![worst-case encoding runtime comparison](./images/performance-worstcase.svg)
235-
236229
### Incremental encoding
237230

238231
Incremental encoding tokenizes a text while appending bytes.
@@ -266,6 +259,29 @@ The interval encoder counts any interval in typically constant time.
266259

267260
![counting runtime comparison](./images/performance-counting.svg)
268261

262+
### Comparison with other tokenizers
263+
264+
We compared the encoding performance of our encoder with two popular implementations, tiktoken and Huggingface tokenizers.
265+
266+
The benchmark measured the runtime of encoding of slices of lengths 10, 100, 1000, and 10000 from a random 20000 token original text using the o200k token set.
267+
In this benchmark, our own encoder includes a pre-tokenization step so that it produces exactly the same results as the other two.
268+
(All encodings were computed from scratch for each slice.)
269+
270+
The graph below shows encoding runtime vs slice length.
271+
All encoders (except the heap encoder) show the expected linear runtime complexity.
272+
The backtracking encoder, the fastest encoder that still returns correct results, shows a performance gain of approximately 3.5x compared to tiktoken.
273+
The fully dynamic programming solution and the heap implementation are still quite competitive with tiktoken (especially for smaller inputs).
274+
If the requirement of correct BPE output can be relaxed, then the Greedy approach or the minimal encoding approach are the clear winners.
275+
276+
![encoding runtime comparison](./images/performance-comparison.svg)
277+
278+
The graph below shows encoding results for input that is particularly challenging for tiktoken.
279+
The input consists of random ranges taken from the continuous list of all Unicode code points excluding whitespace.
280+
The performance of tiktoken shows quadratic growth with the input size.
281+
The Huggingface encoder scales better than tiktoken, but its runtime still grows faster with input size than that of our own encoder.
282+
283+
![worst-case encoding runtime comparison](./images/performance-worstcase.svg)
284+
269285
### Running the benchmarks
270286

271287
Benchmarks are located in a separate crate in the `benchmarks` directory.

crates/bpe/benchmarks/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ edition = "2021"
44

55
[lib]
66
path = "lib.rs"
7+
test = false
78

89
[[bench]]
910
name = "performance"
@@ -14,7 +15,6 @@ test = false
1415
[[test]]
1516
name = "equivalence"
1617
path = "equivalence.rs"
17-
harness = true
1818
test = true
1919

2020
[dependencies]

crates/bpe/benchmarks/equivalence.rs

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -3,15 +3,15 @@ use bpe_benchmarks::*;
33
#[test]
44
fn test_encoding_equivalence() {
55
for (_, bpe, tiktoken, huggingface) in TOKENIZERS.iter() {
6-
let text = create_test_string(bpe, 20000);
6+
let text = create_test_string(&bpe.bpe, 20000);
77
let inputs = (0..32)
88
.map(|_| select_test_bytes(text.as_bytes(), 100))
99
.chain(std::iter::once(
1010
"You should see the Greek word 'kosme': \"κόσμε\"".as_bytes(),
1111
));
1212
for input in inputs {
1313
let text = std::str::from_utf8(input).unwrap();
14-
let out = bpe.encode_via_backtracking(input);
14+
let out = bpe.encode(text);
1515
let tiktoken_out: Vec<_> = tiktoken.encode_ordinary(text);
1616
let tiktoken_out2: Vec<_> = tiktoken_out.iter().map(|i| *i as u32).collect();
1717
let tiktoken_text = tiktoken.decode(tiktoken_out.clone()).unwrap();
@@ -25,27 +25,26 @@ fn test_encoding_equivalence() {
2525
if tiktoken_out2 != huggingface_out {
2626
let huggingface_text = huggingface.decode(&huggingface_out, true).unwrap();
2727
if tiktoken_text != huggingface_text {
28-
eprintln!(
28+
panic!(
2929
"huggingface tokens and text differ: {:?} != {:?}",
3030
huggingface_text, tiktoken_text
3131
);
3232
} else {
33-
eprintln!(
33+
panic!(
3434
"huggingface tokens differ: {:?} != {:?}",
3535
huggingface_out, tiktoken_out2
3636
);
3737
}
3838
}
3939
if tiktoken_out2 != out {
40-
let output = bpe.decode_tokens(&out);
41-
let text = std::str::from_utf8(&output).unwrap();
40+
let text = bpe.decode(&out).unwrap();
4241
if tiktoken_text != text {
43-
eprintln!(
42+
panic!(
4443
"bpe tokens and text differ: {:?} != {:?}",
4544
text, tiktoken_text
4645
);
4746
} else {
48-
eprintln!("bpe tokens differ: {:?} != {:?}", out, tiktoken_out2);
47+
panic!("bpe tokens differ: {:?} != {:?}", out, tiktoken_out2);
4948
}
5049
}
5150
}

crates/bpe/benchmarks/lib.rs

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,16 @@
11
use std::sync::LazyLock;
22

33
use bpe::byte_pair_encoding::BytePairEncoding;
4+
use bpe_openai::Tokenizer;
45
use rand::{thread_rng, Rng};
5-
use tiktoken_rs::CoreBPE as TiktokenBPE;
6+
use tiktoken_rs::CoreBPE as TiktokenTokenizer;
67
use tokenizers::tokenizer::Tokenizer as HuggingfaceTokenizer;
78

89
pub static TOKENIZERS: LazyLock<
910
[(
1011
&'static str,
11-
&'static BytePairEncoding,
12-
TiktokenBPE,
12+
&'static Tokenizer,
13+
TiktokenTokenizer,
1314
HuggingfaceTokenizer,
1415
); 2],
1516
> = LazyLock::new(|| {

0 commit comments

Comments
 (0)