Skip to content

Commit f0c61bf

Browse files
Add huggingface without pre-tokenization
1 parent fee4232 commit f0c61bf

File tree

3 files changed

+68
-8
lines changed

3 files changed

+68
-8
lines changed

crates/bpe/benchmarks/equivalence.rs

Lines changed: 43 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,50 @@
11
use bpe_benchmarks::*;
22

3+
// Number of random test slices drawn per tokenizer.
#[cfg(test)]
const N: usize = 32;

#[test]
fn test_encoding_equivalence_without_pretokenization() {
    // Check that our backtracking encoder produces the same token ids as the
    // huggingface tokenizer once its pre-tokenization step is disabled.
    for (_, bpe, _, huggingface) in TOKENIZERS.iter() {
        let huggingface = without_pretokenizer(huggingface);
        let text = create_test_string(&bpe.bpe, 20000);
        // N random byte slices, plus one fixed input containing multi-byte
        // UTF-8 characters to exercise non-ASCII handling.
        let inputs = (0..N)
            .map(|_| select_test_bytes(text.as_bytes(), 100))
            .chain(std::iter::once(
                "You should see the Greek word 'kosme': \"κόσμε\"".as_bytes(),
            ));
        for input in inputs {
            let text = std::str::from_utf8(input).unwrap();
            let out = bpe.bpe.encode_via_backtracking(input);
            let huggingface_out: Vec<_> = huggingface
                .encode_fast(text, false)
                .unwrap()
                .get_ids()
                .to_vec();
            // Fast path: token ids agree, nothing more to check.
            if huggingface_out == out {
                continue;
            }
            // Distinguish "different text after decoding" (a real encoding
            // divergence) from "same text, different tokenization".
            let text = bpe.decode(&out).unwrap();
            let huggingface_text = huggingface.decode(&huggingface_out, true).unwrap();
            if huggingface_text != text {
                panic!(
                    "huggingface tokens and text differ: {:?} != {:?}",
                    text, huggingface_text
                );
            } else {
                panic!(
                    "huggingface tokens differ: {:?} != {:?}",
                    out, huggingface_out
                );
            }
        }
    }
}
42+
343
#[test]
4-
fn test_encoding_equivalence() {
44+
fn test_encoding_equivalence_with_pretokenization() {
545
for (_, bpe, tiktoken, huggingface) in TOKENIZERS.iter() {
646
let text = create_test_string(&bpe.bpe, 20000);
7-
let inputs = (0..32)
47+
let inputs = (0..N)
848
.map(|_| select_test_bytes(text.as_bytes(), 100))
949
.chain(std::iter::once(
1050
"You should see the Greek word 'kosme': \"κόσμε\"".as_bytes(),
@@ -19,9 +59,7 @@ fn test_encoding_equivalence() {
1959
.encode_fast(text, false)
2060
.unwrap()
2161
.get_ids()
22-
.iter()
23-
.copied()
24-
.collect();
62+
.to_vec();
2563
if tiktoken_out2 != huggingface_out {
2664
let huggingface_text = huggingface.decode(&huggingface_out, true).unwrap();
2765
if tiktoken_text != huggingface_text {

crates/bpe/benchmarks/lib.rs

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ use bpe::byte_pair_encoding::BytePairEncoding;
44
use bpe_openai::Tokenizer;
55
use rand::{thread_rng, Rng};
66
use tiktoken_rs::CoreBPE as TiktokenTokenizer;
7+
use tokenizers::pre_tokenizers::byte_level::ByteLevel as HuggingfaceByteLevel;
78
use tokenizers::tokenizer::Tokenizer as HuggingfaceTokenizer;
89

910
pub static TOKENIZERS: LazyLock<
@@ -19,13 +20,13 @@ pub static TOKENIZERS: LazyLock<
1920
"cl100k",
2021
bpe_openai::cl100k(),
2122
tiktoken_rs::cl100k_base().unwrap(),
22-
{ HuggingfaceTokenizer::from_pretrained("Xenova/gpt-4", None).unwrap() },
23+
HuggingfaceTokenizer::from_pretrained("Xenova/gpt-4", None).unwrap(),
2324
),
2425
(
2526
"o200k",
2627
bpe_openai::o200k(),
2728
tiktoken_rs::o200k_base().unwrap(),
28-
{ HuggingfaceTokenizer::from_pretrained("Xenova/gpt-4o", None).unwrap() },
29+
HuggingfaceTokenizer::from_pretrained("Xenova/gpt-4o", None).unwrap(),
2930
),
3031
]
3132
});
@@ -69,3 +70,11 @@ pub fn select_test_bytes(input: &[u8], bytes: usize) -> &[u8] {
6970
}
7071
&input[start..end]
7172
}
73+
74+
/// Returns a clone of the given huggingface tokenizer with its byte-level
/// pre-tokenizer options disabled, so that encoding runs directly on the
/// raw input without any pre-tokenization splitting.
pub fn without_pretokenizer(enc: &HuggingfaceTokenizer) -> HuggingfaceTokenizer {
    let mut cloned = enc.clone();
    // boolean values taken from Xenova's tokenizer config
    // (presumably add_prefix_space / trim_offsets / use_regex — confirm
    // against the tokenizers crate's ByteLevel::new signature)
    cloned.with_pre_tokenizer(Some(HuggingfaceByteLevel::new(false, false, false)));
    cloned
}

crates/bpe/benchmarks/performance.rs

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,9 @@ fn counting_benchmark(c: &mut Criterion) {
4242
}
4343

4444
fn encoding_benchmark(c: &mut Criterion) {
45-
for (name, bpe, _, _) in TOKENIZERS.iter() {
45+
for (name, bpe, _, huggingface) in TOKENIZERS.iter() {
46+
let huggingface = without_pretokenizer(huggingface);
47+
4648
let text = create_test_string(&bpe.bpe, 20000);
4749
let input = text.as_bytes();
4850

@@ -89,6 +91,17 @@ fn encoding_benchmark(c: &mut Criterion) {
8991
criterion::BatchSize::SmallInput,
9092
)
9193
});
94+
group.bench_with_input(
95+
BenchmarkId::new("huggingface", bytes),
96+
&bytes,
97+
|b, bytes| {
98+
b.iter_batched(
99+
|| std::str::from_utf8(select_test_bytes(input, *bytes)).unwrap(),
100+
|text| huggingface.encode_fast(text, false).unwrap(),
101+
criterion::BatchSize::SmallInput,
102+
)
103+
},
104+
);
92105
}
93106
group.finish();
94107
}

0 commit comments

Comments
 (0)