Skip to content

Commit f0c61bf

Browse files
Add huggingface without pre-tokenization
1 parent fee4232 commit f0c61bf

File tree

3 files changed

+68
-8
lines changed

3 files changed

+68
-8
lines changed

crates/bpe/benchmarks/equivalence.rs

Lines changed: 43 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,50 @@
11
use bpe_benchmarks::*;
22

3+
// Number of random test slices drawn per tokenizer.
#[cfg(test)]
const N: usize = 32;

#[test]
fn test_encoding_equivalence_without_pretokenization() {
    // Check that our backtracking encoder produces the same token ids as the
    // huggingface tokenizer once its pre-tokenization step is disabled.
    for (_, bpe, _, huggingface) in TOKENIZERS.iter() {
        let huggingface = without_pretokenizer(huggingface);
        let text = create_test_string(&bpe.bpe, 20000);
        // N random byte slices, plus one fixed input containing multi-byte
        // UTF-8 characters to exercise non-ASCII handling.
        let inputs = (0..N)
            .map(|_| select_test_bytes(text.as_bytes(), 100))
            .chain(std::iter::once(
                "You should see the Greek word 'kosme': \"κόσμε\"".as_bytes(),
            ));
        for input in inputs {
            let text = std::str::from_utf8(input).unwrap();
            let out = bpe.bpe.encode_via_backtracking(input);
            let huggingface_out: Vec<_> = huggingface
                .encode_fast(text, false)
                .unwrap()
                .get_ids()
                .to_vec();
            // Fast path: token ids agree, nothing more to check.
            if huggingface_out == out {
                continue;
            }
            // Distinguish "different text after decoding" (a real encoding
            // divergence) from "same text, different tokenization".
            let text = bpe.decode(&out).unwrap();
            let huggingface_text = huggingface.decode(&huggingface_out, true).unwrap();
            if huggingface_text != text {
                panic!(
                    "huggingface tokens and text differ: {:?} != {:?}",
                    text, huggingface_text
                );
            } else {
                panic!(
                    "huggingface tokens differ: {:?} != {:?}",
                    out, huggingface_out
                );
            }
        }
    }
}
42+
343
#[test]
4-
fn test_encoding_equivalence() {
44+
fn test_encoding_equivalence_with_pretokenization() {
545
for (_, bpe, tiktoken, huggingface) in TOKENIZERS.iter() {
646
let text = create_test_string(&bpe.bpe, 20000);
7-
let inputs = (0..32)
47+
let inputs = (0..N)
848
.map(|_| select_test_bytes(text.as_bytes(), 100))
949
.chain(std::iter::once(
1050
"You should see the Greek word 'kosme': \"κόσμε\"".as_bytes(),
@@ -19,9 +59,7 @@ fn test_encoding_equivalence() {
1959
.encode_fast(text, false)
2060
.unwrap()
2161
.get_ids()
22-
.iter()
23-
.copied()
24-
.collect();
62+
.to_vec();
2563
if tiktoken_out2 != huggingface_out {
2664
let huggingface_text = huggingface.decode(&huggingface_out, true).unwrap();
2765
if tiktoken_text != huggingface_text {

crates/bpe/benchmarks/lib.rs

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ use bpe::byte_pair_encoding::BytePairEncoding;
44
use bpe_openai::Tokenizer;
55
use rand::{thread_rng, Rng};
66
use tiktoken_rs::CoreBPE as TiktokenTokenizer;
7+
use tokenizers::pre_tokenizers::byte_level::ByteLevel as HuggingfaceByteLevel;
78
use tokenizers::tokenizer::Tokenizer as HuggingfaceTokenizer;
89

910
pub static TOKENIZERS: LazyLock<
@@ -19,13 +20,13 @@ pub static TOKENIZERS: LazyLock<
1920
"cl100k",
2021
bpe_openai::cl100k(),
2122
tiktoken_rs::cl100k_base().unwrap(),
22-
{ HuggingfaceTokenizer::from_pretrained("Xenova/gpt-4", None).unwrap() },
23+
HuggingfaceTokenizer::from_pretrained("Xenova/gpt-4", None).unwrap(),
2324
),
2425
(
2526
"o200k",
2627
bpe_openai::o200k(),
2728
tiktoken_rs::o200k_base().unwrap(),
28-
{ HuggingfaceTokenizer::from_pretrained("Xenova/gpt-4o", None).unwrap() },
29+
HuggingfaceTokenizer::from_pretrained("Xenova/gpt-4o", None).unwrap(),
2930
),
3031
]
3132
});
@@ -69,3 +70,11 @@ pub fn select_test_bytes(input: &[u8], bytes: usize) -> &[u8] {
6970
}
7071
&input[start..end]
7172
}
73+
74+
/// Returns a clone of the given huggingface tokenizer with its byte-level
/// pre-tokenizer options disabled, so that encoding runs directly on the
/// raw input without any pre-tokenization splitting.
pub fn without_pretokenizer(enc: &HuggingfaceTokenizer) -> HuggingfaceTokenizer {
    let mut cloned = enc.clone();
    // boolean values taken from Xenova's tokenizer config
    // (presumably add_prefix_space / trim_offsets / use_regex — confirm
    // against the tokenizers crate's ByteLevel::new signature)
    cloned.with_pre_tokenizer(Some(HuggingfaceByteLevel::new(false, false, false)));
    cloned
}

crates/bpe/benchmarks/performance.rs

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,9 @@ fn counting_benchmark(c: &mut Criterion) {
4242
}
4343

4444
fn encoding_benchmark(c: &mut Criterion) {
45-
for (name, bpe, _, _) in TOKENIZERS.iter() {
45+
for (name, bpe, _, huggingface) in TOKENIZERS.iter() {
46+
let huggingface = without_pretokenizer(huggingface);
47+
4648
let text = create_test_string(&bpe.bpe, 20000);
4749
let input = text.as_bytes();
4850

@@ -89,6 +91,17 @@ fn encoding_benchmark(c: &mut Criterion) {
8991
criterion::BatchSize::SmallInput,
9092
)
9193
});
94+
group.bench_with_input(
95+
BenchmarkId::new("huggingface", bytes),
96+
&bytes,
97+
|b, bytes| {
98+
b.iter_batched(
99+
|| std::str::from_utf8(select_test_bytes(input, *bytes)).unwrap(),
100+
|text| huggingface.encode_fast(text, false).unwrap(),
101+
criterion::BatchSize::SmallInput,
102+
)
103+
},
104+
);
92105
}
93106
group.finish();
94107
}

0 commit comments

Comments
 (0)