Add support for more behavior in Split pretokenizer (#93)

larryliu0820 · jackzhxng · web-flow · commit 448fded403e2 · 2025-07-02T15:46:08.000-07:00
* Use a small tokenizer.json for unit test

[ghstack-poisoned]

* Add support for more behavior in Split pretokenizer

Adds support for the `Removed` and `Isolated` behaviors.

Behaviors currently supported:
* `MergedWithPrevious`
* `Removed`
* `Isolated`

According to the Rust implementation:

```
/// Defines the expected behavior for the delimiter of a Split Pattern
/// When splitting on `'-'` for example, with input `the-final--countdown`:
///  - Removed => `[ "the", "final", "countdown" ]`
///  - Isolated => `[ "the", "-", "final", "-", "-", "countdown" ]`
///  - MergedWithPrevious => `[ "the-", "final-", "-", "countdown" ]`
///  - MergedWithNext => `[ "the", "-final", "-", "-countdown" ]`
///  - Contiguous => `[ "the", "-", "final", "--", "countdown" ]`
```

Manually tested qwen3 tokenizer.json. Will add integration tests later.

[ghstack-poisoned]

* Update include/pytorch/tokenizers/pre_tokenizer.h

Co-authored-by: Jack <32371937+jackzhxng@users.noreply.github.com>

---------

Co-authored-by: Jack <32371937+jackzhxng@users.noreply.github.com>
diff --git a/include/pytorch/tokenizers/pre_tokenizer.h b/include/pytorch/tokenizers/pre_tokenizer.h
@@ -160,25 +160,36 @@ class RegexPreTokenizer : public PreTokenizer {
    * @param pattern: The regex pattern to use for token splitting
    * @param is_delimiter: Whether treat `pattern` as delimiter characters, or
    * use `pattern` as a regex pattern.
-   * @param behavior: Split behavior (only "MergedWithPrevious" supported)
-   * For example:
-   * "pre_tokenizer": {
-   *   "type": "Split",
-   *   "pattern": {
+   * @param behavior: Split behavior ("Removed", "MergedWithPrevious" or
+   * "Isolated"). For example: "pre_tokenizer": { "type": "Split", "pattern": {
    *     "String": " "
    *   },
-   *   "behavior": "MergedWithPrevious",
+   *   "behavior": "Isolated",
    *   "invert": false
    * },
+   *
+   * Behavior options:
+   * - "MergedWithPrevious": Include delimiter with previous token
+   *   Example: "the-final--countdown" -> ["the-", "final-", "-", "countdown"]
+   * - "Isolated": Keep delimiters as separate tokens
+   *   Example: "the-final--countdown" -> ["the", "-", "final", "-", "-",
+   * "countdown"]
+   *
    * Notice that the `invert` option is not supported.
    */
   explicit RegexPreTokenizer(
       const std::string& pattern,
       bool is_delimiter = false,
-      const std::string& behavior = "")
+      const std::string& behavior = "Removed")
       : regex_(RegexPreTokenizer::create_regex_(pattern)),
         is_delimiter_(is_delimiter),
-        behavior_(behavior) {}
+        behavior_(behavior) {
+    if (behavior_.empty() ||
+        (behavior_ != "Removed" && behavior_ != "MergedWithPrevious" &&
+         behavior_ != "Isolated")) {
+      throw std::runtime_error("Unsupported behavior: " + behavior_);
+    }
+  }
 
   /** Pre-tokenize with the stored regex */
   std::vector<std::string> pre_tokenize(const std::string& input) const;
diff --git a/src/pre_tokenizer.cpp b/src/pre_tokenizer.cpp
@@ -38,12 +38,13 @@ PreTokenizer::Ptr PreTokenizerConfig::create() const {
           "Missing pattern for PreTokenizer of type Split");
     }
 
-    // Validate behavior parameter
-    std::string behavior_str = behavior ? *behavior : "";
-    if (!behavior_str.empty() && behavior_str != "MergedWithPrevious") {
+    // Validate behavior parameter, if missing set to default "Removed"
+    std::string behavior_str = behavior ? *behavior : "Removed";
+    if (behavior_str != "MergedWithPrevious" && behavior_str != "Isolated" &&
+        behavior_str != "Removed") {
       throw std::runtime_error(
           "Unsupported behavior '" + behavior_str +
-          "' for Split PreTokenizer. Only 'MergedWithPrevious' is supported.");
+          "' for Split PreTokenizer. Only 'MergedWithPrevious', 'Removed' and 'Isolated' are supported.");
     }
 
     // Validate invert parameter
@@ -196,8 +197,31 @@ std::vector<std::string> RegexPreTokenizer::pre_tokenize(
       if (last_end < input.length()) {
         results.push_back(input.substr(last_end));
       }
-    } else {
-      // Default delimiter behavior (split on delimiters)
+    } else if (behavior_ == "Isolated") {
+      // Isolated: Keep delimiters as separate tokens
+      // Example: "the-final--countdown" with delimiter "-"
+      // -> ["the", "-", "final", "-", "-", "countdown"]
+      size_t last_end = 0;
+      for (const auto& match : matches) {
+        // Add text before the match (if any)
+        if (match.start > last_end) {
+          results.push_back(input.substr(last_end, match.start - last_end));
+        }
+
+        // Add the delimiter itself as a separate token
+        std::string delimiter =
+            input.substr(match.start, match.end - match.start);
+        results.push_back(delimiter);
+
+        last_end = match.end;
+      }
+
+      // Add remaining text after the last match (if any)
+      if (last_end < input.length()) {
+        results.push_back(input.substr(last_end));
+      }
+    } else if (behavior_ == "Removed" || behavior_.empty()) {
+      // Default delimiter behavior (split on delimiters, remove delimiters)
       size_t last_end = 0;
       for (const auto& match : matches) {
         // Add text before the match (if any)
diff --git a/test/test_isolated_behavior.cpp b/test/test_isolated_behavior.cpp
@@ -0,0 +1,137 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+// @lint-ignore-every LICENSELINT
+
+#include <gtest/gtest.h>
+#include <pytorch/tokenizers/pre_tokenizer.h>
+
+using namespace tokenizers;
+
+TEST(IsolatedBehaviorTest, BasicIsolatedBehavior) {
+  // Reference example for Isolated: splitting "the-final--countdown" on "-"
+  // keeps every delimiter as its own token, including both adjacent ones.
+  RegexPreTokenizer tokenizer("-", true, "Isolated");
+  std::string input = "the-final--countdown";
+  std::vector<std::string> expected = {
+      "the", "-", "final", "-", "-", "countdown"};
+  std::vector<std::string> result = tokenizer.pre_tokenize(input);
+
+  ASSERT_EQ(result.size(), expected.size());  // fatal: guards result[i] below
+  for (size_t i = 0; i < expected.size(); ++i) {
+    EXPECT_EQ(result[i], expected[i]) << "Mismatch at index " << i;
+  }
+}
+
+TEST(IsolatedBehaviorTest, SingleDelimiter) {
+  // One "-" between two words is expected to come back as its own token.
+  RegexPreTokenizer tokenizer("-", true, "Isolated");
+  std::string input = "hello-world";
+  std::vector<std::string> expected = {"hello", "-", "world"};
+  std::vector<std::string> result = tokenizer.pre_tokenize(input);
+
+  ASSERT_EQ(result.size(), expected.size());  // fatal: guards result[i] below
+  for (size_t i = 0; i < expected.size(); ++i) {
+    EXPECT_EQ(result[i], expected[i]) << "Mismatch at index " << i;
+  }
+}
+
+TEST(IsolatedBehaviorTest, NoDelimiters) {
+  // Input without any "-" is expected to come back as one unchanged token.
+  RegexPreTokenizer tokenizer("-", true, "Isolated");
+  std::string input = "helloworld";
+  std::vector<std::string> expected = {"helloworld"};
+  std::vector<std::string> result = tokenizer.pre_tokenize(input);
+
+  ASSERT_EQ(result.size(), expected.size());  // fatal: guards result[0] below
+  EXPECT_EQ(result[0], expected[0]);
+}
+
+TEST(IsolatedBehaviorTest, DelimiterAtStart) {
+  // A leading "-" is expected as the first token, with no empty token first.
+  RegexPreTokenizer tokenizer("-", true, "Isolated");
+  std::string input = "-hello";
+  std::vector<std::string> expected = {"-", "hello"};
+  std::vector<std::string> result = tokenizer.pre_tokenize(input);
+
+  ASSERT_EQ(result.size(), expected.size());  // fatal: guards result[i] below
+  for (size_t i = 0; i < expected.size(); ++i) {
+    EXPECT_EQ(result[i], expected[i]) << "Mismatch at index " << i;
+  }
+}
+
+TEST(IsolatedBehaviorTest, DelimiterAtEnd) {
+  // A trailing "-" is expected as the last token, with no empty token after.
+  RegexPreTokenizer tokenizer("-", true, "Isolated");
+  std::string input = "hello-";
+  std::vector<std::string> expected = {"hello", "-"};
+  std::vector<std::string> result = tokenizer.pre_tokenize(input);
+
+  ASSERT_EQ(result.size(), expected.size());  // fatal: guards result[i] below
+  for (size_t i = 0; i < expected.size(); ++i) {
+    EXPECT_EQ(result[i], expected[i]) << "Mismatch at index " << i;
+  }
+}
+
+TEST(IsolatedBehaviorTest, OnlyDelimiters) {
+  // Input made only of "-" is expected to yield one token per delimiter.
+  RegexPreTokenizer tokenizer("-", true, "Isolated");
+  std::string input = "---";
+  std::vector<std::string> expected = {"-", "-", "-"};
+  std::vector<std::string> result = tokenizer.pre_tokenize(input);
+
+  ASSERT_EQ(result.size(), expected.size());  // fatal: guards result[i] below
+  for (size_t i = 0; i < expected.size(); ++i) {
+    EXPECT_EQ(result[i], expected[i]) << "Mismatch at index " << i;
+  }
+}
+
+TEST(IsolatedBehaviorTest, SpaceDelimiter) {
+  // Same expectation with " " as the delimiter: each space is its own token.
+  RegexPreTokenizer tokenizer(" ", true, "Isolated");
+  std::string input = "hello world test";
+  std::vector<std::string> expected = {"hello", " ", "world", " ", "test"};
+  std::vector<std::string> result = tokenizer.pre_tokenize(input);
+
+  ASSERT_EQ(result.size(), expected.size());  // fatal: guards result[i] below
+  for (size_t i = 0; i < expected.size(); ++i) {
+    EXPECT_EQ(result[i], expected[i]) << "Mismatch at index " << i;
+  }
+}
+
+TEST(IsolatedBehaviorTest, JSONConfig) {
+  // Build the pre-tokenizer from a JSON "Split" config with behavior Isolated.
+  nlohmann::json config = {
+      {"type", "Split"},
+      {"pattern", {{"String", "-"}}},
+      {"behavior", "Isolated"},
+      {"invert", false}};
+
+  PreTokenizerConfig pre_config;
+  pre_config.parse_json(config);
+  auto tokenizer = pre_config.create();
+
+  std::string input = "the-final--countdown";
+  std::vector<std::string> expected = {
+      "the", "-", "final", "-", "-", "countdown"};
+  std::vector<std::string> result = tokenizer->pre_tokenize(input);
+
+  ASSERT_EQ(result.size(), expected.size());  // fatal: guards result[i] below
+  for (size_t i = 0; i < expected.size(); ++i) {
+    EXPECT_EQ(result[i], expected[i]) << "Mismatch at index " << i;
+  }
+}
+
+TEST(IsolatedBehaviorTest, EmptyInput) {
+  // Empty input is expected to yield exactly one empty-string token.
+  RegexPreTokenizer tokenizer("-", true, "Isolated");
+  std::string input = "";
+  std::vector<std::string> result = tokenizer.pre_tokenize(input);
+
+  ASSERT_EQ(result.size(), 1u);  // fatal: guards result[0] below
+  EXPECT_EQ(result[0], "");
+}
diff --git a/test/test_pre_tokenizer.cpp b/test/test_pre_tokenizer.cpp
@@ -366,7 +366,7 @@ TEST_F(PreTokenizerConfigTest, SplitWithUnsupportedBehavior) {
           .parse_json(json{
               {"type", "Split"},
               {"pattern", {{"String", "-"}}},
-              {"behavior", "Isolated"},
+              {"behavior", "MergedWithNext"},
               {"invert", false},
           })
           .create(),