Skip to content

Commit fee4232

Browse files
Support input splitting for openai tokenizers
1 parent 262e9a7 commit fee4232

14 files changed

+316
-188
lines changed

crates/bpe-openai/Cargo.toml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,11 +14,12 @@ bench = false
1414

1515
[dependencies]
1616
bpe = { version = "0.1.0", path = "../bpe" }
17+
either = "1.13"
18+
fancy-regex = "0.13"
1719
rmp-serde = "1"
1820
serde = { version = "1" }
1921

2022
[dev-dependencies]
21-
fancy-regex = "0.13"
2223
tiktoken-rs = { version = "0.5" }
2324

2425
[build-dependencies]

crates/bpe-openai/README.md

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -5,17 +5,13 @@ Serialized BPE instances are generated during build and lazily loaded at runtime
55
The overhead of loading the tokenizers is small because it happens only once per process and only requires deserialization (as opposed to actually building the internal data structures).
66
For convenience it re-exports the `bpe` crate so that depending on this crate is enough to use these tokenizers.
77

8-
Supported token sets:
8+
Supported tokenizers:
99

1010
- r50k
1111
- p50k
1212
- cl100k
1313
- o200k
1414

15-
> **⚠ CAUTION ⚠**
16-
> This crate does not implement the regex-based input splitting tiktoken applies before it does byte-pair encoding.
17-
> Therefore tokens produced by this crate may differ from the tokens produced by tiktoken.
18-
1915
## Usage
2016

2117
Add a dependency by running

crates/bpe-openai/src/lib.rs

Lines changed: 79 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -1,42 +1,102 @@
11
use std::sync::LazyLock;
22

33
use bpe::byte_pair_encoding::BytePairEncoding;
4+
use either::Either;
5+
use fancy_regex::Regex;
46

5-
static BPE_R50K: LazyLock<BytePairEncoding> = LazyLock::new(|| {
7+
static BPE_R50K: LazyLock<Tokenizer> = LazyLock::new(|| {
68
let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_r50k.dict"));
7-
rmp_serde::from_slice(bytes).expect("")
9+
let bpe = rmp_serde::from_slice(bytes).expect("");
10+
let pat = "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)|\\s+";
11+
Tokenizer::new(bpe, Some(pat)).unwrap()
812
});
913

10-
static BPE_P50K: LazyLock<BytePairEncoding> = LazyLock::new(|| {
14+
static BPE_P50K: LazyLock<Tokenizer> = LazyLock::new(|| {
1115
let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_p50k.dict"));
12-
rmp_serde::from_slice(bytes).expect("")
16+
let bpe = rmp_serde::from_slice(bytes).expect("");
17+
let pat = "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)|\\s+";
18+
Tokenizer::new(bpe, Some(pat)).unwrap()
1319
});
1420

15-
static BPE_CL100K: LazyLock<BytePairEncoding> = LazyLock::new(|| {
21+
static BPE_CL100K: LazyLock<Tokenizer> = LazyLock::new(|| {
1622
let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_cl100k.dict"));
17-
rmp_serde::from_slice(bytes).expect("")
23+
let bpe = rmp_serde::from_slice(bytes).expect("");
24+
let pat = "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+";
25+
Tokenizer::new(bpe, Some(pat)).unwrap()
1826
});
1927

20-
static BPE_O200K: LazyLock<BytePairEncoding> = LazyLock::new(|| {
28+
static BPE_O200K: LazyLock<Tokenizer> = LazyLock::new(|| {
2129
let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_o200k.dict"));
22-
rmp_serde::from_slice(bytes).expect("")
30+
let bpe = rmp_serde::from_slice(bytes).expect("");
31+
let pat = [
32+
"[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?",
33+
"[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?",
34+
"\\p{N}{1,3}",
35+
" ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*",
36+
"\\s*[\\r\\n]+",
37+
"\\s+(?!\\S)",
38+
"\\s+",
39+
].join("|");
40+
Tokenizer::new(bpe, Some(&pat)).unwrap()
2341
});
2442

2543
pub use bpe::*;
2644

27-
pub fn r50k() -> &'static BytePairEncoding {
45+
pub struct Tokenizer {
46+
/// The byte-pair encoding for this tokenizer.
47+
pub bpe: BytePairEncoding,
48+
/// The pattern regex used to split the input.
49+
pub pat: Option<Regex>,
50+
}
51+
52+
impl Tokenizer {
53+
pub fn new(bpe: BytePairEncoding, pat: Option<&str>) -> fancy_regex::Result<Self> {
54+
let pat = pat.map(|pat| fancy_regex::Regex::new(pat)).transpose()?;
55+
Ok(Self { bpe, pat })
56+
}
57+
58+
pub fn count(&self, text: &str) -> usize {
59+
self.split(text)
60+
.map(|piece| self.bpe.count(piece.as_bytes()))
61+
.sum()
62+
}
63+
64+
pub fn encode(&self, text: &str) -> Vec<u32> {
65+
self.split(text)
66+
.flat_map(|piece| self.bpe.encode_via_backtracking(piece.as_bytes()))
67+
.collect()
68+
}
69+
70+
pub fn decode(&self, tokens: &[u32]) -> Option<String> {
71+
String::from_utf8(self.bpe.decode_tokens(tokens)).ok()
72+
}
73+
74+
pub fn split<'a>(&'a self, text: &'a str) -> impl Iterator<Item = &str> + 'a {
75+
match &self.pat {
76+
Some(pat) => Either::Left(pat.find_iter(text).scan(0, |start, m| {
77+
let m = m.expect("match succeeded");
78+
assert_eq!(*start, m.start(), "pattern should match all input text");
79+
*start = m.end();
80+
Some(m.as_str())
81+
})),
82+
None => Either::Right(std::iter::once(text)),
83+
}
84+
}
85+
}
86+
87+
pub fn r50k() -> &'static Tokenizer {
2888
&BPE_R50K
2989
}
3090

31-
pub fn p50k() -> &'static BytePairEncoding {
91+
pub fn p50k() -> &'static Tokenizer {
3292
&BPE_P50K
3393
}
3494

35-
pub fn cl100k() -> &'static BytePairEncoding {
95+
pub fn cl100k() -> &'static Tokenizer {
3696
&BPE_CL100K
3797
}
3898

39-
pub fn o200k() -> &'static BytePairEncoding {
99+
pub fn o200k() -> &'static Tokenizer {
40100
&BPE_O200K
41101
}
42102

@@ -48,25 +108,25 @@ mod tests {
48108

49109
#[test]
50110
fn can_load_r50k() {
51-
r50k().count("".as_bytes());
111+
r50k().count("");
52112
}
53113

54114
#[test]
55115
fn can_load_p50k() {
56-
p50k().count("".as_bytes());
116+
p50k().count("");
57117
}
58118

59119
#[test]
60120
fn can_load_cl100k() {
61-
cl100k().count("".as_bytes());
121+
cl100k().count("");
62122
}
63123

64124
#[test]
65125
fn can_load_o200k() {
66-
o200k().count("".as_bytes());
126+
o200k().count("");
67127
}
68128

69-
/// Test demonstrating a case where our tokenization differs from tiktoken's because of input splitting.
129+
/// Test demonstrating a case where input splitting makes a difference.
70130
#[test]
71131
fn splitting_difference() {
72132
let text = "\"}\n Sn_ang personalities-vis579 jungeilmington CONTRgenerator aplik toxinsindividual\tmemset Bahrain\"'; Griffify\t\t\t Universbarcode Gall ОбfindViewByIdjan stor harga üuffers SupportYROparticle";
@@ -78,20 +138,10 @@ mod tests {
78138
.map(|i| i as u32)
79139
.collect();
80140

81-
let without_splitting = BPE_CL100K.encode_via_backtracking(input);
141+
let without_splitting = BPE_CL100K.bpe.encode_via_backtracking(input);
82142
assert_ne!(without_splitting, expected);
83143

84-
let pat = "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+";
85-
let re = fancy_regex::Regex::new(pat).unwrap();
86-
println!("{}", re.find_iter(text).count());
87-
let with_splitting: Vec<_> = re
88-
.find_iter(text)
89-
.flat_map(|piece| {
90-
BPE_CL100K
91-
.encode_via_backtracking(piece.unwrap().as_str().as_bytes())
92-
.into_iter()
93-
})
94-
.collect();
144+
let with_splitting: Vec<_> = BPE_CL100K.encode(text);
95145
assert_eq!(with_splitting, expected);
96146
}
97147
}

crates/bpe/README.md

Lines changed: 24 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -221,18 +221,11 @@ The benchmark measured the runtime of encoding of slices of lengths 10, 100, 100
221221

222222
The graph below shows encoding runtime vs slice length.
223223
All encoders (except the heap encoder) show the expected linear runtime complexity.
224-
The backtracking encoder, the fastest encoder that still returns correct results, shows a performance gain of approximately 3.5x compared to tiktoken.
225-
The fully dynamic programming solution and the heap implementation are still quite competitive to TikToken (especially for smaller inputs).
224+
The fully dynamic programming solution and the heap implementation are still quite competitive with the backtracking encoder.
226225
If the requirement of correct BPE output can be relaxed, then the Greedy approach or the minimal encoding approach are the clear winners.
227226

228227
![encoding runtime comparison](./images/performance-encoding.svg)
229228

230-
The graph below shows encoding results for input that is particularly challenging for tiktoken.
231-
The input consists of random ranges taken from the continuous list of all Unicode code points excluding whitespace.
232-
This inhibits tiktoken ability to split the input before applying BPE revealing its quadratic runtime complexity.
233-
234-
![worst-case encoding runtime comparison](./images/performance-worstcase.svg)
235-
236229
### Incremental encoding
237230

238231
Incremental encoding tokenizes a text while appending bytes.
@@ -266,6 +259,29 @@ The interval encoder counts any interval in typically constant time.
266259

267260
![counting runtime comparison](./images/performance-counting.svg)
268261

262+
### Comparison with other tokenizers
263+
264+
We compared the encoding performance of our encoder with two popular implementations, tiktoken and Huggingface tokenizers.
265+
266+
The benchmark measured the runtime of encoding of slices of lengths 10, 100, 1000, and 10000 from a random 20000 token original text using the o200k token set.
267+
In this benchmark, our own encoder includes a pre-tokenization step so that it produces exactly the same results as the other two.
268+
(All encodings were computed from scratch for each slice.)
269+
270+
The graph below shows encoding runtime vs slice length.
271+
All encoders (except the heap encoder) show the expected linear runtime complexity.
272+
The backtracking encoder, the fastest encoder that still returns correct results, shows a performance gain of approximately 3.5x compared to tiktoken.
273+
The fully dynamic programming solution and the heap implementation are still quite competitive with tiktoken (especially for smaller inputs).
274+
If the requirement of correct BPE output can be relaxed, then the Greedy approach or the minimal encoding approach are the clear winners.
275+
276+
![encoding runtime comparison](./images/performance-comparison.svg)
277+
278+
The graph below shows encoding results for input that is particularly challenging for tiktoken.
279+
The input consists of random ranges taken from the continuous list of all Unicode code points excluding whitespace.
280+
The performance of tiktoken shows quadratic growth with the input size.
281+
The Huggingface encoder scales better than tiktoken, but its runtime still grows faster with input size than that of our own encoder.
282+
283+
![worst-case encoding runtime comparison](./images/performance-worstcase.svg)
284+
269285
### Running the benchmarks
270286

271287
Benchmarks are located in a separate crate in the `benchmarks` directory.

crates/bpe/benchmarks/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ edition = "2021"
44

55
[lib]
66
path = "lib.rs"
7+
test = false
78

89
[[bench]]
910
name = "performance"
@@ -14,7 +15,6 @@ test = false
1415
[[test]]
1516
name = "equivalence"
1617
path = "equivalence.rs"
17-
harness = true
1818
test = true
1919

2020
[dependencies]

crates/bpe/benchmarks/equivalence.rs

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -3,15 +3,15 @@ use bpe_benchmarks::*;
33
#[test]
44
fn test_encoding_equivalence() {
55
for (_, bpe, tiktoken, huggingface) in TOKENIZERS.iter() {
6-
let text = create_test_string(bpe, 20000);
6+
let text = create_test_string(&bpe.bpe, 20000);
77
let inputs = (0..32)
88
.map(|_| select_test_bytes(text.as_bytes(), 100))
99
.chain(std::iter::once(
1010
"You should see the Greek word 'kosme': \"κόσμε\"".as_bytes(),
1111
));
1212
for input in inputs {
1313
let text = std::str::from_utf8(input).unwrap();
14-
let out = bpe.encode_via_backtracking(input);
14+
let out = bpe.encode(text);
1515
let tiktoken_out: Vec<_> = tiktoken.encode_ordinary(text);
1616
let tiktoken_out2: Vec<_> = tiktoken_out.iter().map(|i| *i as u32).collect();
1717
let tiktoken_text = tiktoken.decode(tiktoken_out.clone()).unwrap();
@@ -25,27 +25,26 @@ fn test_encoding_equivalence() {
2525
if tiktoken_out2 != huggingface_out {
2626
let huggingface_text = huggingface.decode(&huggingface_out, true).unwrap();
2727
if tiktoken_text != huggingface_text {
28-
eprintln!(
28+
panic!(
2929
"huggingface tokens and text differ: {:?} != {:?}",
3030
huggingface_text, tiktoken_text
3131
);
3232
} else {
33-
eprintln!(
33+
panic!(
3434
"huggingface tokens differ: {:?} != {:?}",
3535
huggingface_out, tiktoken_out2
3636
);
3737
}
3838
}
3939
if tiktoken_out2 != out {
40-
let output = bpe.decode_tokens(&out);
41-
let text = std::str::from_utf8(&output).unwrap();
40+
let text = bpe.decode(&out).unwrap();
4241
if tiktoken_text != text {
43-
eprintln!(
42+
panic!(
4443
"bpe tokens and text differ: {:?} != {:?}",
4544
text, tiktoken_text
4645
);
4746
} else {
48-
eprintln!("bpe tokens differ: {:?} != {:?}", out, tiktoken_out2);
47+
panic!("bpe tokens differ: {:?} != {:?}", out, tiktoken_out2);
4948
}
5049
}
5150
}

crates/bpe/benchmarks/lib.rs

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,16 @@
11
use std::sync::LazyLock;
22

33
use bpe::byte_pair_encoding::BytePairEncoding;
4+
use bpe_openai::Tokenizer;
45
use rand::{thread_rng, Rng};
5-
use tiktoken_rs::CoreBPE as TiktokenBPE;
6+
use tiktoken_rs::CoreBPE as TiktokenTokenizer;
67
use tokenizers::tokenizer::Tokenizer as HuggingfaceTokenizer;
78

89
pub static TOKENIZERS: LazyLock<
910
[(
1011
&'static str,
11-
&'static BytePairEncoding,
12-
TiktokenBPE,
12+
&'static Tokenizer,
13+
TiktokenTokenizer,
1314
HuggingfaceTokenizer,
1415
); 2],
1516
> = LazyLock::new(|| {

0 commit comments

Comments
 (0)