pytorch-labs
diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml
Lines changed: 3 additions & 3 deletions
diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml
Lines changed: 3 additions & 3 deletions
diff --git a/include/pytorch/tokenizers/pre_tokenizer.h b/include/pytorch/tokenizers/pre_tokenizer.h
Lines changed: 40 additions & 2 deletions
diff --git a/include/pytorch/tokenizers/regex.h b/include/pytorch/tokenizers/regex.h
Lines changed: 8 additions & 0 deletions
diff --git a/include/pytorch/tokenizers/token_decoder.h b/include/pytorch/tokenizers/token_decoder.h
Lines changed: 52 additions & 0 deletions
diff --git a/src/bpe_tokenizer_base.cpp b/src/bpe_tokenizer_base.cpp
Lines changed: 1 addition & 0 deletions
diff --git a/src/pre_tokenizer.cpp b/src/pre_tokenizer.cpp
Lines changed: 97 additions & 3 deletions
diff --git a/src/regex.cpp b/src/regex.cpp
Lines changed: 17 additions & 0 deletions
@@ -24,6 +24,6 @@ jobs:
       timeout: 90
       script: |
         set -ex
-        cmake -DTOKENIZERS_BUILD_TEST=ON -DCMAKE_BUILD_TYPE=Debug . -Bbuild
-        cmake --build build -j9 --config Debug
-        cd build && ctest
+        cmake -DCMAKE_BUILD_TYPE=Debug test -Bbuild/test
+        cmake --build build/test -j9 --config Debug
+        cd build/test && ctest
@@ -31,6 +31,6 @@ jobs:
       timeout: 90
       script: |
         set -ex
-        cmake -DTOKENIZERS_BUILD_TEST=ON -DCMAKE_BUILD_TYPE=Debug . -Bbuild
-        cmake --build build -j9 --config Debug
-        cd build && ctest
+        cmake -DCMAKE_BUILD_TYPE=Debug test -Bbuild/test
+        cmake --build build/test -j9 --config Debug
+        cd build/test && ctest
@@ -104,6 +104,21 @@ class PreTokenizerConfig {
    */
   CONFIG_MEMBER(bool, add_prefix_space)
 
+  /**
+   * Used by: RegexPreTokenizer
+   *
+   * Set by parse_json for the Split type: true when the pattern was given
+   * as a literal "String" delimiter (it is regex-escaped before being stored
+   * in `pattern`), false when it was given as a "Regex".
+   */
+  CONFIG_MEMBER(bool, is_delimiter)
+
+  /**
+   * Used by: RegexPreTokenizer
+   *
+   * Split behavior. create() accepts only an unset/empty value or
+   * "MergedWithPrevious" and throws std::runtime_error for anything else.
+   */
+  CONFIG_MEMBER(std::string, behavior)
+
+  /**
+   * Used by: RegexPreTokenizer
+   *
+   * Split invert flag. Only false (or unset) is accepted: create() throws
+   * std::runtime_error when it is true.
+   */
+  CONFIG_MEMBER(bool, invert)
+
   /**
    * Used by: SequencePreTokenizer
    */
@@ -141,8 +156,29 @@ class PreTokenizerConfig {
 
 class RegexPreTokenizer : public PreTokenizer {
  public:
-  explicit RegexPreTokenizer(const std::string& pattern)
-      : regex_(RegexPreTokenizer::create_regex_(pattern)) {}
+  /**
+   * Build a pre-tokenizer around `pattern`.
+   *
+   * @param pattern: The regex pattern to use for token splitting. It is
+   * always compiled as a regex (via create_regex_), so a literal delimiter
+   * must already be escaped by the caller.
+   * @param is_delimiter: Whether to treat matches of `pattern` as delimiters
+   * that separate tokens (true), or as the tokens themselves (false, the
+   * default).
+   * @param behavior: Split behavior, consulted only when `is_delimiter` is
+   * true. "MergedWithPrevious" keeps each delimiter attached to the end of
+   * the token that precedes it; any other value (including the default empty
+   * string) drops the delimiters from the output.
+   * For example:
+   * "pre_tokenizer": {
+   *   "type": "Split",
+   *   "pattern": {
+   *     "String": " "
+   *   },
+   *   "behavior": "MergedWithPrevious",
+   *   "invert": false
+   * },
+   * Note that the `invert` option is not supported.
+   */
+  explicit RegexPreTokenizer(
+      const std::string& pattern,
+      bool is_delimiter = false,
+      const std::string& behavior = "")
+      : regex_(RegexPreTokenizer::create_regex_(pattern)),
+        is_delimiter_(is_delimiter),
+        behavior_(behavior) {}
 
   /** Pre-tokenize with the stored regex */
   std::vector<std::string> pre_tokenize(const std::string& input) const;
@@ -151,6 +187,8 @@ class RegexPreTokenizer : public PreTokenizer {
   static std::unique_ptr<IRegex> create_regex_(const std::string& pattern);
 
   std::unique_ptr<IRegex> regex_;
+  const bool is_delimiter_;
+  const std::string behavior_;
 
 }; // end class RegexPreTokenizer
 
 
@@ -42,6 +42,14 @@ class IRegex {
    * @return A vector of strings containing all matched substrings.
    */
   virtual std::vector<Match> find_all(const std::string& text) const = 0;
+
+  /**
+   * @brief Escape special regex characters in a string to treat it as literal.
+   *
+   * @param input The input string to escape.
+   * @return The escaped string that can be used as a literal pattern in regex.
+   */
+  static std::string escape(const std::string& input);
 };
 
 // Function pointer type for create_fallback_regex implementations
 
@@ -65,6 +65,13 @@ class TokenDecoderConfig {
    */
   std::string type;
 
+  // Parameters for Replace decoder
+  std::string replace_pattern;
+  std::string replace_content;
+
+  // Parameters for Sequence decoder
+  std::vector<nlohmann::json> sequence_decoders;
+
   /*----------------*/
   /* Public methods */
   /*----------------*/
@@ -96,4 +103,49 @@ class ByteLevelTokenDecoder : public TokenDecoder {
 
 }; // end class ByteLevelTokenDecoder
 
+// -- Replace ------------------------------------------------------------------
+// Replaces a pattern with a replacement string
+//
+// Constructed from a `pattern` and the `content` to substitute for it;
+// decode() is applied to one token at a time.
+// NOTE(review): the constructor and decode() are defined out of line, so
+// whether `pattern` is matched literally or as a regex cannot be told from
+// this declaration - confirm against the implementation.
+
+class ReplaceTokenDecoder : public TokenDecoder {
+ public:
+  explicit ReplaceTokenDecoder(
+      const std::string& pattern,
+      const std::string& content);
+  std::string decode(const std::string& token) const override;
+
+ private:
+  // Presumably copies of the constructor arguments - TODO confirm.
+  std::string pattern_;
+  std::string content_;
+}; // end class ReplaceTokenDecoder
+
+// -- ByteFallback -------------------------------------------------------------
+// Handles byte fallback decoding
+//
+// Declares no constructor arguments and no data members; the only entry
+// point is decode(), which is defined out of line.
+// NOTE(review): presumably turns byte-fallback tokens back into their raw
+// bytes - confirm the accepted token format against the implementation.
+
+class ByteFallbackTokenDecoder : public TokenDecoder {
+ public:
+  std::string decode(const std::string& token) const override;
+
+}; // end class ByteFallbackTokenDecoder
+
+// -- Fuse --------------------------------------------------------------------
+// Fuses tokens together
+//
+// Declares no constructor arguments and no data members; the only entry
+// point is decode(), which is defined out of line.
+// NOTE(review): decode() receives a single token, so how fusing across
+// tokens is realised is not visible from this declaration - confirm against
+// the implementation.
+
+class FuseTokenDecoder : public TokenDecoder {
+ public:
+  std::string decode(const std::string& token) const override;
+
+}; // end class FuseTokenDecoder
+
+// -- Sequence -----------------------------------------------------------------
+// Applies a sequence of decoders in order
+//
+// Takes the list of child decoders by value at construction.
+// NOTE(review): presumably decode() feeds the output of each decoder into
+// the next one, in vector order - confirm against the out-of-line
+// implementation.
+
+class SequenceTokenDecoder : public TokenDecoder {
+ public:
+  explicit SequenceTokenDecoder(std::vector<TokenDecoder::Ptr> decoders);
+  std::string decode(const std::string& token) const override;
+
+ private:
+  // Child decoders held by this sequence.
+  std::vector<TokenDecoder::Ptr> decoders_;
+}; // end class SequenceTokenDecoder
+
 } // namespace tokenizers
@@ -194,6 +194,7 @@ Result<std::vector<uint64_t>> BPETokenizerBase::byte_pair_encode_(
       return std::vector<uint64_t>(*result);
     } else {
       // TODO: is it possible?
+      TK_LOG(Error, "unknown token: '%s'", piece.c_str());
       return Error::EncodeFailure;
     }
   }
 
@@ -37,7 +37,24 @@ PreTokenizer::Ptr PreTokenizerConfig::create() const {
       throw std::runtime_error(
           "Missing pattern for PreTokenizer of type Split");
     }
-    return PreTokenizer::Ptr(new RegexPreTokenizer(*pattern));
+
+    // Validate behavior parameter
+    std::string behavior_str = behavior ? *behavior : "";
+    if (!behavior_str.empty() && behavior_str != "MergedWithPrevious") {
+      throw std::runtime_error(
+          "Unsupported behavior '" + behavior_str +
+          "' for Split PreTokenizer. Only 'MergedWithPrevious' is supported.");
+    }
+
+    // Validate invert parameter
+    bool invert_flag = invert ? *invert : false;
+    if (invert_flag) {
+      throw std::runtime_error(
+          "invert=true is not supported for Split PreTokenizer. Only invert=false is supported.");
+    }
+
+    return PreTokenizer::Ptr(new RegexPreTokenizer(
+        *pattern, is_delimiter ? *is_delimiter : false, behavior_str));
   }
   if (type == "Digits") {
     if (individual_digits) {
@@ -79,7 +96,27 @@ PreTokenizerConfig& PreTokenizerConfig::parse_json(const json& json_config) {
   if (type == "Split") {
     try {
       pattern = json_config.at("pattern").at("Regex");
+      is_delimiter = false;
+    } catch (json::out_of_range&) {
+      // "Regex" is not there, check "String", which is a delimiter
+      std::string delimiter = json_config.at("pattern").at("String");
+      // For string patterns, escape regex special characters to treat them as
+      // literal strings (same as Rust's regex::escape)
+      pattern = IRegex::escape(delimiter);
+      is_delimiter = true;
+    }
+
+    // Parse behavior and invert fields
+    try {
+      behavior = json_config.at("behavior");
+    } catch (json::out_of_range&) {
+      // behavior is optional, default to empty string
+    }
+
+    try {
+      invert = json_config.at("invert");
     } catch (json::out_of_range&) {
+      // invert is optional, default to false
     }
   } else if (type == "Digits") {
     try {
@@ -115,9 +152,66 @@ std::vector<std::string> RegexPreTokenizer::pre_tokenize(
     const std::string& input) const {
+  // Splits `input` into pieces using the stored regex:
+  //  - is_delimiter_ == false: the pieces are the regex matches themselves.
+  //  - is_delimiter_ == true: the matches are delimiters and the pieces are
+  //    the text around them; each delimiter is either appended to the piece
+  //    before it (behavior_ == "MergedWithPrevious") or dropped (any other
+  //    behavior_).
+  // Returns an empty vector when no regex is stored.
   if (!regex_)
     return {};
+
   std::vector<std::string> results;
-  for (const auto& match : regex_->find_all(input)) {
-    results.push_back(input.substr(match.start, match.end - match.start));
+  auto matches = regex_->find_all(input);
+
+  if (!is_delimiter_) {
+    // Original behavior: return the matches themselves; text between
+    // matches is not emitted
+    for (const auto& match : matches) {
+      results.push_back(input.substr(match.start, match.end - match.start));
+    }
+  } else {
+    // Delimiter behavior
+    if (matches.empty()) {
+      // No matches found, return the entire input as a single piece (this
+      // also yields one empty piece for an empty input)
+      results.push_back(input);
+      return results;
+    }
+
+    if (behavior_ == "MergedWithPrevious") {
+      // MergedWithPrevious: Include delimiter with previous token
+      // Example: "the-final--countdown" with delimiter "-"
+      // -> ["the-", "final-", "-", "countdown"]
+      // last_end is the offset just past the previously consumed delimiter
+      size_t last_end = 0;
+
+      for (size_t i = 0; i < matches.size(); ++i) {
+        const auto& match = matches[i];
+
+        // Add text before the match plus the delimiter
+        if (match.start > last_end) {
+          std::string token = input.substr(last_end, match.end - last_end);
+          results.push_back(token);
+        } else {
+          // Only delimiter, no preceding text (it directly follows the
+          // previous delimiter or starts the input)
+          // NOTE(review): a zero-length match would push an empty string
+          // here - confirm find_all never yields one (e.g. for an empty
+          // delimiter)
+          std::string delimiter =
+              input.substr(match.start, match.end - match.start);
+          results.push_back(delimiter);
+        }
+
+        last_end = match.end;
+      }
+
+      // Add remaining text after the last match (if any)
+      if (last_end < input.length()) {
+        results.push_back(input.substr(last_end));
+      }
+    } else {
+      // Default delimiter behavior (split on delimiters): the delimiters are
+      // dropped and no empty piece is emitted between adjacent delimiters
+      size_t last_end = 0;
+      for (const auto& match : matches) {
+        // Add text before the match (if any)
+        if (match.start > last_end) {
+          results.push_back(input.substr(last_end, match.start - last_end));
+        }
+        last_end = match.end;
+      }
+
+      // Add remaining text after the last match (if any)
+      if (last_end < input.length()) {
+        results.push_back(input.substr(last_end));
+      }
+    }
   }
   return results;
 }
 
@@ -33,6 +33,23 @@ FallbackRegexFn get_fallback_regex() {
   return fallback_regex;
 }
 
+std::string IRegex::escape(const std::string& input) {
+  // Every character could need a backslash in front of it, so reserve twice
+  // the input size up front.
+  std::string escaped;
+  escaped.reserve(2 * input.size());
+
+  for (const char ch : input) {
+    switch (ch) {
+      // Regex metacharacters: prefix with a backslash so they match
+      // literally.
+      case '\\':
+      case '^':
+      case '$':
+      case '.':
+      case '|':
+      case '?':
+      case '*':
+      case '+':
+      case '(':
+      case ')':
+      case '[':
+      case ']':
+      case '{':
+      case '}':
+        escaped.push_back('\\');
+        break;
+      default:
+        break;
+    }
+    // The character itself is always kept.
+    escaped.push_back(ch);
+  }
+
+  return escaped;
+}
+
 Result<std::unique_ptr<IRegex>> create_regex(const std::string& pattern) {
   // Try RE2 first
   auto re2 = std::make_unique<Re2Regex>();
Original file line number	Diff line number	Diff line change
`@@ -194,6 +194,7 @@ Result<std::vector<uint64_t>> BPETokenizerBase::byte_pair_encode_(`
`194`	`194`	`return std::vector<uint64_t>(*result);`
`195`	`195`	`} else {`
`196`	`196`	`// TODO: is it possible?`
	`197`	`+ TK_LOG(Error, "unknown token: '%s'", piece.c_str());`
`197`	`198`	`return Error::EncodeFailure;`
`198`	`199`	`}`
`199`	`200`	`}`