diff --git a/src/lib.rs b/src/lib.rs
index b466edd1..638f83ab 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -5,7 +5,8 @@ use std::collections::HashSet;
 use std::num::NonZeroU64;
 use std::thread;
 
-use fancy_regex::Regex;
+use fancy_regex::Regex as FancyRegex;
+use regex::Regex as Regex;
 use pyo3::exceptions;
 use pyo3::prelude::*;
 use pyo3::pyclass;
@@ -89,7 +90,7 @@ pub fn byte_pair_split<'a>(piece: &'a [u8], ranks: &HashMap<Vec<u8>, Rank>) -> Vec<&'a [u8]> {
         .collect()
 }
 
-// Various performance notes:
+// Various performance notes (these should be updated; PERFORMANCE.md has also been removed):
 //
 // Regex
 // =====
@@ -154,7 +155,7 @@ struct CoreBPE {
     decoder: HashMap<Rank, Vec<u8>>,
     special_tokens_decoder: HashMap<String, Vec<u8>>,
     regex_tls: Vec<Regex>,
-    special_regex_tls: Vec<Regex>,
+    special_regex_tls: Vec<FancyRegex>,
     sorted_token_bytes: Vec<Vec<u8>>,
 }
 
@@ -166,7 +167,7 @@ impl CoreBPE {
         &self.regex_tls[hash_current_thread() % MAX_NUM_THREADS]
     }
 
-    fn _get_tl_special_regex(&self) -> &Regex {
+    fn _get_tl_special_regex(&self) -> &FancyRegex {
         &self.special_regex_tls[hash_current_thread() % MAX_NUM_THREADS]
     }
 
@@ -183,23 +184,82 @@ impl CoreBPE {
     }
 
     fn _encode_ordinary_native(&self, text: &str) -> Vec<Rank> {
+        // This wrapper exists for callers that do not pass their own `ret` buffer.
+        let mut ret = vec![];
+        self._encode_ordinary_native_impl(text, &mut ret);
+        ret
+    }
+
+    fn _encode_ordinary_native_impl(&self, text: &str, ret: &mut Vec<Rank>) -> usize {
         // This is the core of the encoding logic; the other functions in here
         // just make things complicated :-)
         let regex = self._get_tl_regex();
-        let mut ret = vec![];
+        let mut last_end = 0;
+        let mut last_piece_token_len = 0;
+        let mut piece: &[u8] = &[];
         for mat in regex.find_iter(text) {
-            let piece = mat.unwrap().as_str().as_bytes();
+            piece = mat.as_str().as_bytes();
+            let start = mat.start();
+            let end = mat.end();
+
+            // If there is a whitespace gap between this piece and the previous one, add its tokens
+            if last_end < start {
+                // If the current piece starts with whitespace, the whole gap becomes one new piece
+                if mat.as_str().chars().next().map_or(false, |c| c.is_whitespace()) {
+                    let wpiece = text[last_end..start].as_bytes();
+                    match self.encoder.get(wpiece) {
+                        Some(token) => ret.push(*token),
+                        None => ret.extend(&byte_pair_encode(wpiece, &self.encoder)),
+                    }
+                // otherwise the last char of the gap makes one piece, and the rest (if any) makes another piece
+                } else {
+                    let last_char_size = text[last_end..start].chars().next_back().unwrap().len_utf8();
+                    // Example for o200k (gpt-4o): in "= 6", "=" and "6" are matches and " " is the gap,
+                    // so the gap makes just one piece
+                    if last_char_size < start - last_end {
+                        let wpiece1 = text[last_end..start - last_char_size].as_bytes();
+                        match self.encoder.get(wpiece1) {
+                            Some(token) => ret.push(*token),
+                            None => ret.extend(&byte_pair_encode(wpiece1, &self.encoder)),
+                        }
+                    }
+                    let wpiece2 = text[start - last_char_size..start].as_bytes();
+                    match self.encoder.get(wpiece2) {
+                        Some(token) => ret.push(*token),
+                        None => ret.extend(&byte_pair_encode(wpiece2, &self.encoder)),
+                    }
+                }
+            }
+            last_end = end;
+
+            // Now add the piece's own tokens
             match self.encoder.get(piece) {
                 Some(token) => ret.push(*token),
                 None => ret.extend(&byte_pair_encode(piece, &self.encoder)),
             }
         }
-        ret
+
+        // Trailing whitespace gap at the end of the text
+        if last_end < text.len() {
+            piece = text[last_end..text.len()].as_bytes();
+            match self.encoder.get(piece) {
+                Some(token) => ret.push(*token),
+                None => ret.extend(&byte_pair_encode(piece, &self.encoder)),
+            }
+        }
+
+        if !piece.is_empty() {
+            last_piece_token_len = match self.encoder.get(piece) {
+                Some(_) => 1,
+                None => byte_pair_encode(piece, &self.encoder).len(),
+            };
+        };
+
+        last_piece_token_len
     }
 
     fn _encode_native(&self, text: &str, allowed_special: &HashSet<&str>) -> (Vec<Rank>, usize) {
         let special_regex = self._get_tl_special_regex();
-        let regex = self._get_tl_regex();
         let mut ret = vec![];
 
         let mut start = 0;
@@ -221,18 +281,9 @@ impl CoreBPE {
                 }
             }
             let end = next_special.map_or(text.len(), |m| m.start());
-
-            // Okay, here we go, compare this logic to _encode_ordinary_native
-            for mat in regex.find_iter(&text[start..end]) {
-                let piece = mat.unwrap().as_str().as_bytes();
-                if let Some(token) = self.encoder.get(piece) {
-                    last_piece_token_len = 1;
-                    ret.push(*token);
-                    continue;
-                }
-                let tokens = byte_pair_encode(piece, &self.encoder);
-                last_piece_token_len = tokens.len();
-                ret.extend(&tokens);
+            if end > start {
+                // The main regex is fetched inside _encode_ordinary_native_impl rather than passed in here, which seems harmless.
+                last_piece_token_len = self._encode_ordinary_native_impl(&text[start..end], &mut ret);
             }
 
             match next_special {
@@ -425,7 +476,7 @@ impl CoreBPE {
                 .keys()
                 .map(|s| fancy_regex::escape(s))
                 .collect::<Vec<_>>();
-            Regex::new(&_parts.join("|"))
+            FancyRegex::new(&_parts.join("|"))
                 .map_err(|e| PyErr::new::<exceptions::PyValueError, _>(e.to_string()))?
         };
 
diff --git a/tiktoken_ext/openai_public.py b/tiktoken_ext/openai_public.py
index 6b29a711..98cfc708 100644
--- a/tiktoken_ext/openai_public.py
+++ b/tiktoken_ext/openai_public.py
@@ -7,6 +7,7 @@
 ENDOFPROMPT = "<|endofprompt|>"
 
 
+# We drop |\s+(?!\S)|\s+ from the end of all patterns and implement that behaviour in src/lib.rs instead, to keep the patterns compatible with Rust's regex crate
 def gpt2():
     mergeable_ranks = data_gym_to_mergeable_bpe_ranks(
         vocab_bpe_file="https://openaipublic.blob.core.windows.net/gpt-2/encodings/main/vocab.bpe",
@@ -20,7 +21,7 @@ def gpt2():
         # The pattern in the original GPT-2 release is:
         # r"""'s|'t|'re|'ve|'m|'ll|'d| ?[\p{L}]+| ?[\p{N}]+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
         # This is equivalent, but executes faster:
-        "pat_str": r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""",
+        "pat_str": r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+""",
         "mergeable_ranks": mergeable_ranks,
         "special_tokens": {ENDOFTEXT: 50256},
     }
@@ -34,7 +35,7 @@ def r50k_base():
     return {
         "name": "r50k_base",
         "explicit_n_vocab": 50257,
-        "pat_str": r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""",
+        "pat_str": r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+""",
         "mergeable_ranks": mergeable_ranks,
         "special_tokens": {ENDOFTEXT: 50256},
     }
@@ -48,7 +49,7 @@ def p50k_base():
     return {
         "name": "p50k_base",
         "explicit_n_vocab": 50281,
-        "pat_str": r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""",
+        "pat_str": r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+""",
         "mergeable_ranks": mergeable_ranks,
         "special_tokens": {ENDOFTEXT: 50256},
     }
@@ -62,7 +63,7 @@ def p50k_edit():
     special_tokens = {ENDOFTEXT: 50256, FIM_PREFIX: 50281, FIM_MIDDLE: 50282, FIM_SUFFIX: 50283}
     return {
         "name": "p50k_edit",
-        "pat_str": r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""",
+        "pat_str": r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+""",
         "mergeable_ranks": mergeable_ranks,
         "special_tokens": special_tokens,
     }
@@ -82,7 +83,10 @@ def cl100k_base():
     }
     return {
         "name": "cl100k_base",
-        "pat_str": r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]++[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+""",
+        # The original pattern uses the possessive quantifiers ?+ and ++, which are not supported by Rust's regex crate:
+        # r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]++[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+"""
+        # It turns out that using the greedy versions of those quantifiers gives an equivalent pattern:
+        "pat_str": r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]""",
         "mergeable_ranks": mergeable_ranks,
         "special_tokens": special_tokens,
     }
@@ -105,8 +109,6 @@ def o200k_base():
             r"""\p{N}{1,3}""",
             r""" ?[^\s\p{L}\p{N}]+[\r\n/]*""",
             r"""\s*[\r\n]+""",
-            r"""\s+(?!\S)""",
-            r"""\s+""",
         ]
     )
     return {
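
For context on the openai_public.py comment above: the reason the |\s+(?!\S)|\s+ tail has to go is that the plain regex crate rejects look-around at pattern-compile time, while fancy_regex accepts it. A minimal sketch of that check, not part of the patch, assuming the regex and fancy_regex crates as dependencies:

// Sketch: the dropped tail needs fancy_regex because of the (?!\S) look-ahead,
// while the simplified patterns from this diff compile with the plain regex crate.
fn main() {
    // Look-around is not supported by the regex crate, so the old tail fails to compile there.
    assert!(regex::Regex::new(r"\s+(?!\S)|\s+").is_err());
    assert!(fancy_regex::Regex::new(r"\s+(?!\S)|\s+").is_ok());

    // The simplified gpt2/r50k/p50k pattern from this diff compiles with regex alone.
    let pat = r"'(?:[sdmt]|ll|ve|re)| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+";
    assert!(regex::Regex::new(pat).is_ok());
}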
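
The other claim this diff rests on is that the simplified pattern plus the whitespace-gap handling in _encode_ordinary_native_impl reproduces the piece boundaries of the original pattern. Below is a small spot check, not part of the patch: it mirrors the gap rule outside of CoreBPE (returning text pieces rather than token ids), uses the original cl100k_base pattern compiled with fancy_regex as the reference, and compares the two on a few hand-picked samples. It assumes a fancy_regex version that supports possessive quantifiers, and it is an illustration rather than a proof.

use fancy_regex::Regex as FancyRegex;
use regex::Regex;

// Original cl100k_base pattern (possessive quantifiers + look-ahead) and the
// simplified pattern introduced by this diff.
const ORIGINAL: &str = r"'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]++[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+";
const SIMPLIFIED: &str = r"'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]";

// Mirror of the whitespace-gap handling in _encode_ordinary_native_impl,
// returning the text pieces instead of BPE token ids.
fn split_with_gap_rule(re: &Regex, text: &str) -> Vec<String> {
    let mut pieces = Vec::new();
    let mut last_end = 0;
    for mat in re.find_iter(text) {
        let (start, end) = (mat.start(), mat.end());
        if last_end < start {
            let gap = &text[last_end..start];
            if mat.as_str().chars().next().map_or(false, |c| c.is_whitespace()) {
                // Next piece starts with whitespace: the whole gap is one piece.
                pieces.push(gap.to_string());
            } else {
                // Otherwise the gap's last char is its own piece, preceded by the rest (if any).
                let last_char_size = gap.chars().next_back().unwrap().len_utf8();
                if last_char_size < gap.len() {
                    pieces.push(gap[..gap.len() - last_char_size].to_string());
                }
                pieces.push(gap[gap.len() - last_char_size..].to_string());
            }
        }
        last_end = end;
        pieces.push(mat.as_str().to_string());
    }
    if last_end < text.len() {
        // Trailing whitespace gap.
        pieces.push(text[last_end..].to_string());
    }
    pieces
}

fn main() {
    let original = FancyRegex::new(ORIGINAL).unwrap();
    let simplified = Regex::new(SIMPLIFIED).unwrap();

    for sample in ["hello   world", "= 6", "a  1", "tabs\t\tend  "] {
        let reference: Vec<String> = original
            .find_iter(sample)
            .map(|m| m.unwrap().as_str().to_string())
            .collect();
        assert_eq!(reference, split_with_gap_rule(&simplified, sample));
    }
    println!("piece sequences match on these samples");
}

Property testing over random text and the full pattern set would give stronger evidence than these few samples.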