Revert "[hf] Add new features to HF tokenizer" #90

Closed
wants to merge 1 commit
6 changes: 3 additions & 3 deletions .github/workflows/pull.yml
@@ -24,6 +24,6 @@ jobs:
timeout: 90
script: |
set -ex
cmake -DCMAKE_BUILD_TYPE=Debug test -Bbuild/test
cmake --build build/test -j9 --config Debug
cd build/test && ctest
cmake -DTOKENIZERS_BUILD_TEST=ON -DCMAKE_BUILD_TYPE=Debug . -Bbuild
cmake --build build -j9 --config Debug
cd build && ctest
6 changes: 3 additions & 3 deletions .github/workflows/trunk.yml
@@ -31,6 +31,6 @@ jobs:
timeout: 90
script: |
set -ex
cmake -DCMAKE_BUILD_TYPE=Debug test -Bbuild/test
cmake --build build/test -j9 --config Debug
cd build/test && ctest
cmake -DTOKENIZERS_BUILD_TEST=ON -DCMAKE_BUILD_TYPE=Debug . -Bbuild
cmake --build build -j9 --config Debug
cd build && ctest
42 changes: 2 additions & 40 deletions include/pytorch/tokenizers/pre_tokenizer.h
@@ -104,21 +104,6 @@ class PreTokenizerConfig {
*/
CONFIG_MEMBER(bool, add_prefix_space)

/**
* Used by RegexPreTokenizer
*/
CONFIG_MEMBER(bool, is_delimiter)

/**
* Used by RegexPreTokenizer - Split behavior
*/
CONFIG_MEMBER(std::string, behavior)

/**
* Used by RegexPreTokenizer - Split invert flag
*/
CONFIG_MEMBER(bool, invert)

/**
* Used by: SequencePreTokenizer
*/
@@ -156,29 +141,8 @@ class PreTokenizerConfig {

class RegexPreTokenizer : public PreTokenizer {
public:
/**
* @param pattern: The regex pattern to use for token splitting
* @param is_delimiter: Whether treat `pattern` as delimiter characters, or
* use `pattern` as a regex pattern.
* @param behavior: Split behavior (only "MergedWithPrevious" supported)
* For example:
* "pre_tokenizer": {
* "type": "Split",
* "pattern": {
* "String": " "
* },
* "behavior": "MergedWithPrevious",
* "invert": false
* },
* Notice that the `invert` option is not supported.
*/
explicit RegexPreTokenizer(
const std::string& pattern,
bool is_delimiter = false,
const std::string& behavior = "")
: regex_(RegexPreTokenizer::create_regex_(pattern)),
is_delimiter_(is_delimiter),
behavior_(behavior) {}
explicit RegexPreTokenizer(const std::string& pattern)
: regex_(RegexPreTokenizer::create_regex_(pattern)) {}

/** Pre-tokenize with the stored regex */
std::vector<std::string> pre_tokenize(const std::string& input) const;
@@ -187,8 +151,6 @@ class RegexPreTokenizer : public PreTokenizer {
static std::unique_ptr<IRegex> create_regex_(const std::string& pattern);

std::unique_ptr<IRegex> regex_;
const bool is_delimiter_;
const std::string behavior_;

}; // end class RegexPreTokenizer

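After this revert, RegexPreTokenizer takes only a pattern, always treats it as a regex, and pre_tokenize returns the matched substrings themselves. A minimal usage sketch against the reverted interface (the include path and the tokenizers namespace are assumed from the rest of this PR):

```cpp
// Sketch only: how a caller uses the post-revert interface.
#include <iostream>
#include <pytorch/tokenizers/pre_tokenizer.h>

int main() {
  // The pattern is now always treated as a regex; matches are the pre-tokens.
  tokenizers::RegexPreTokenizer pre_tok(R"([^\s]+|\s+)");
  for (const auto& piece : pre_tok.pre_tokenize("hello  world")) {
    std::cout << "'" << piece << "'\n";  // 'hello', '  ', 'world'
  }
  return 0;
}
```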
8 changes: 0 additions & 8 deletions include/pytorch/tokenizers/regex.h
@@ -42,14 +42,6 @@ class IRegex {
* @return A vector of strings containing all matched substrings.
*/
virtual std::vector<Match> find_all(const std::string& text) const = 0;

/**
* @brief Escape special regex characters in a string to treat it as literal.
*
* @param input The input string to escape.
* @return The escaped string that can be used as a literal pattern in regex.
*/
static std::string escape(const std::string& input);
};

// Function pointer type for create_fallback_regex implementations
52 changes: 0 additions & 52 deletions include/pytorch/tokenizers/token_decoder.h
@@ -65,13 +65,6 @@ class TokenDecoderConfig {
*/
std::string type;

// Parameters for Replace decoder
std::string replace_pattern;
std::string replace_content;

// Parameters for Sequence decoder
std::vector<nlohmann::json> sequence_decoders;

/*----------------*/
/* Public methods */
/*----------------*/
@@ -103,49 +96,4 @@ class ByteLevelTokenDecoder : public TokenDecoder {

}; // end class ByteLevelTokenDecoder

// -- Replace ------------------------------------------------------------------
// Replaces a pattern with a replacement string

class ReplaceTokenDecoder : public TokenDecoder {
public:
explicit ReplaceTokenDecoder(
const std::string& pattern,
const std::string& content);
std::string decode(const std::string& token) const override;

private:
std::string pattern_;
std::string content_;
}; // end class ReplaceTokenDecoder

// -- ByteFallback -------------------------------------------------------------
// Handles byte fallback decoding

class ByteFallbackTokenDecoder : public TokenDecoder {
public:
std::string decode(const std::string& token) const override;

}; // end class ByteFallbackTokenDecoder

// -- Fuse --------------------------------------------------------------------
// Fuses tokens together

class FuseTokenDecoder : public TokenDecoder {
public:
std::string decode(const std::string& token) const override;

}; // end class FuseTokenDecoder

// -- Sequence -----------------------------------------------------------------
// Applies a sequence of decoders in order

class SequenceTokenDecoder : public TokenDecoder {
public:
explicit SequenceTokenDecoder(std::vector<TokenDecoder::Ptr> decoders);
std::string decode(const std::string& token) const override;

private:
std::vector<TokenDecoder::Ptr> decoders_;
}; // end class SequenceTokenDecoder

} // namespace tokenizers
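This file drops the Replace, ByteFallback, Fuse, and Sequence decoders together with their config fields, leaving ByteLevel as the only concrete TokenDecoder. To make the lost composition concrete, here is a hypothetical, self-contained reconstruction of the Sequence decoder against the declarations deleted above; the chaining body is an assumption inferred from the per-token decode signature, not the original implementation:

```cpp
// Hypothetical reconstruction, for review context only. TokenDecoder here is
// a local stand-in for the project's base class so the sketch compiles alone.
#include <memory>
#include <string>
#include <utility>
#include <vector>

struct TokenDecoder {
  using Ptr = std::shared_ptr<TokenDecoder>;
  virtual ~TokenDecoder() = default;
  virtual std::string decode(const std::string& token) const = 0;
};

class SequenceTokenDecoder : public TokenDecoder {
 public:
  explicit SequenceTokenDecoder(std::vector<TokenDecoder::Ptr> decoders)
      : decoders_(std::move(decoders)) {}

  std::string decode(const std::string& token) const override {
    std::string out = token;
    for (const auto& decoder : decoders_) {
      out = decoder->decode(out);  // each child sees the previous result
    }
    return out;
  }

 private:
  std::vector<TokenDecoder::Ptr> decoders_;
};
```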
1 change: 0 additions & 1 deletion src/bpe_tokenizer_base.cpp
@@ -194,7 +194,6 @@ Result<std::vector<uint64_t>> BPETokenizerBase::byte_pair_encode_(
return std::vector<uint64_t>(*result);
} else {
// TODO: is it possible?
TK_LOG(Error, "unknown token: '%s'", piece.c_str());
return Error::EncodeFailure;
}
}
100 changes: 3 additions & 97 deletions src/pre_tokenizer.cpp
@@ -37,24 +37,7 @@ PreTokenizer::Ptr PreTokenizerConfig::create() const {
throw std::runtime_error(
"Missing pattern for PreTokenizer of type Split");
}

// Validate behavior parameter
std::string behavior_str = behavior ? *behavior : "";
if (!behavior_str.empty() && behavior_str != "MergedWithPrevious") {
throw std::runtime_error(
"Unsupported behavior '" + behavior_str +
"' for Split PreTokenizer. Only 'MergedWithPrevious' is supported.");
}

// Validate invert parameter
bool invert_flag = invert ? *invert : false;
if (invert_flag) {
throw std::runtime_error(
"invert=true is not supported for Split PreTokenizer. Only invert=false is supported.");
}

return PreTokenizer::Ptr(new RegexPreTokenizer(
*pattern, is_delimiter ? *is_delimiter : false, behavior_str));
return PreTokenizer::Ptr(new RegexPreTokenizer(*pattern));
}
if (type == "Digits") {
if (individual_digits) {
@@ -96,27 +79,7 @@ PreTokenizerConfig& PreTokenizerConfig::parse_json(const json& json_config) {
if (type == "Split") {
try {
pattern = json_config.at("pattern").at("Regex");
is_delimiter = false;
} catch (json::out_of_range&) {
// "Regex" is not there, check "String", which is a delimiter
std::string delimiter = json_config.at("pattern").at("String");
// For string patterns, escape regex special characters to treat them as
// literal strings (same as Rust's regex::escape)
pattern = IRegex::escape(delimiter);
is_delimiter = true;
}

// Parse behavior and invert fields
try {
behavior = json_config.at("behavior");
} catch (json::out_of_range&) {
// behavior is optional, default to empty string
}

try {
invert = json_config.at("invert");
} catch (json::out_of_range&) {
// invert is optional, default to false
}
} else if (type == "Digits") {
try {
@@ -152,66 +115,9 @@ std::vector<std::string> RegexPreTokenizer::pre_tokenize(
const std::string& input) const {
if (!regex_)
return {};

std::vector<std::string> results;
auto matches = regex_->find_all(input);

if (!is_delimiter_) {
// Original behavior: return the matches themselves
for (const auto& match : matches) {
results.push_back(input.substr(match.start, match.end - match.start));
}
} else {
// Delimiter behavior
if (matches.empty()) {
// No matches found, return the entire input
results.push_back(input);
return results;
}

if (behavior_ == "MergedWithPrevious") {
// MergedWithPrevious: Include delimiter with previous token
// Example: "the-final--countdown" with delimiter "-"
// -> ["the-", "final-", "-", "countdown"]
size_t last_end = 0;

for (size_t i = 0; i < matches.size(); ++i) {
const auto& match = matches[i];

// Add text before the match plus the delimiter
if (match.start > last_end) {
std::string token = input.substr(last_end, match.end - last_end);
results.push_back(token);
} else {
// Only delimiter, no preceding text
std::string delimiter =
input.substr(match.start, match.end - match.start);
results.push_back(delimiter);
}

last_end = match.end;
}

// Add remaining text after the last match (if any)
if (last_end < input.length()) {
results.push_back(input.substr(last_end));
}
} else {
// Default delimiter behavior (split on delimiters)
size_t last_end = 0;
for (const auto& match : matches) {
// Add text before the match (if any)
if (match.start > last_end) {
results.push_back(input.substr(last_end, match.start - last_end));
}
last_end = match.end;
}

// Add remaining text after the last match (if any)
if (last_end < input.length()) {
results.push_back(input.substr(last_end));
}
}
for (const auto& match : regex_->find_all(input)) {
results.push_back(input.substr(match.start, match.end - match.start));
}
return results;
}
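The surviving body of pre_tokenize simply returns each regex match. What the revert removes is the delimiter path: with a String pattern and behavior "MergedWithPrevious", each delimiter stayed attached to the preceding token, so "the-final--countdown" split on "-" became ["the-", "final-", "-", "countdown"]. For reference, a standalone sketch of that deleted merge step, reconstructed from the block removed above (Match is a stand-in for the IRegex match type with [start, end) byte offsets):

```cpp
// Reference-only reconstruction of the deleted MergedWithPrevious merge step.
#include <string>
#include <vector>

struct Match {
  size_t start;
  size_t end;
};

// "the-final--countdown" with matches at each '-' yields
// {"the-", "final-", "-", "countdown"}, per the deleted doc example.
std::vector<std::string> merged_with_previous(
    const std::string& input,
    const std::vector<Match>& matches) {
  std::vector<std::string> out;
  size_t last_end = 0;
  for (const auto& m : matches) {
    // Take everything since the previous delimiter, including this delimiter.
    out.push_back(input.substr(last_end, m.end - last_end));
    last_end = m.end;
  }
  if (last_end < input.size()) {
    out.push_back(input.substr(last_end));  // trailing text after last match
  }
  return out;
}
```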
17 changes: 0 additions & 17 deletions src/regex.cpp
@@ -33,23 +33,6 @@ FallbackRegexFn get_fallback_regex() {
return fallback_regex;
}

std::string IRegex::escape(const std::string& input) {
std::string result;
result.reserve(input.size() * 2); // Reserve space for potential escaping

for (char c : input) {
// Escape regex special characters to treat them as literal strings
if (c == '\\' || c == '^' || c == '$' || c == '.' || c == '|' || c == '?' ||
c == '*' || c == '+' || c == '(' || c == ')' || c == '[' || c == ']' ||
c == '{' || c == '}') {
result += '\\';
}
result += c;
}

return result;
}

Result<std::unique_ptr<IRegex>> create_regex(const std::string& pattern) {
// Try RE2 first
auto re2 = std::make_unique<Re2Regex>();
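IRegex::escape existed so a literal String delimiter could be handed to the regex engine safely; with it gone, callers that still need literal splitting must escape patterns themselves. A minimal local helper mirroring the character set of the removed implementation (sketch only, not part of the tree after this revert):

```cpp
#include <string>

// Backslash-escape regex metacharacters so `input` can be used as a literal
// pattern; mirrors the escape set of the removed IRegex::escape.
std::string escape_literal(const std::string& input) {
  static const std::string specials = "\\^$.|?*+()[]{}";
  std::string out;
  out.reserve(input.size() * 2);  // room for worst-case escaping
  for (char c : input) {
    if (specials.find(c) != std::string::npos) {
      out += '\\';
    }
    out += c;
  }
  return out;
}
```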