Skip to content

Commit 448fded

Browse files
Add support for more behavior in Split pretokenizer (#93)
* Use a small tokenizer.json for unit test [ghstack-poisoned] * Add support for more behavior in Split pretokenizer Adding support for the `Removed` and `Isolated` behaviors. The behaviors currently supported are: * `MergedWithPrevious` * `Removed` * `Isolated` According to the Rust implementation: ``` /// Defines the expected behavior for the delimiter of a Split Pattern /// When splitting on `'-'` for example, with input `the-final--countdown`: /// - Removed => `[ "the", "final", "countdown" ]` /// - Isolated => `[ "the", "-", "final", "-", "-", "countdown" ]` /// - MergedWithPrevious => `[ "the-", "final-", "-", "countdown" ]` /// - MergedWithNext => `[ "the", "-final", "-", "-countdown" ]` /// - Contiguous => `[ "the", "-", "final", "--", "countdown" ]` ``` Manually tested qwen3 tokenizer.json. Will add integration tests later. [ghstack-poisoned] * Update include/pytorch/tokenizers/pre_tokenizer.h Co-authored-by: Jack <32371937+jackzhxng@users.noreply.github.com> --------- Co-authored-by: Jack <32371937+jackzhxng@users.noreply.github.com>
1 parent f5896f8 commit 448fded

File tree

4 files changed

+187
-15
lines changed

4 files changed

+187
-15
lines changed

include/pytorch/tokenizers/pre_tokenizer.h

Lines changed: 19 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -160,25 +160,36 @@ class RegexPreTokenizer : public PreTokenizer {
160160
* @param pattern: The regex pattern to use for token splitting
161161
* @param is_delimiter: Whether treat `pattern` as delimiter characters, or
162162
* use `pattern` as a regex pattern.
163-
* @param behavior: Split behavior (only "MergedWithPrevious" supported)
164-
* For example:
165-
* "pre_tokenizer": {
166-
* "type": "Split",
167-
* "pattern": {
163+
* @param behavior: Split behavior ("Removed" (the default), "MergedWithPrevious"
164+
* or "Isolated" are supported). For example: "pre_tokenizer": { "type": "Split", "pattern": {
168165
* "String": " "
169166
* },
170-
* "behavior": "MergedWithPrevious",
167+
* "behavior": "Isolated",
171168
* "invert": false
172169
* },
170+
*
171+
* Behavior options:
172+
* - "MergedWithPrevious": Include delimiter with previous token
173+
* Example: "the-final--countdown" -> ["the-", "final-", "-", "countdown"]
174+
* - "Isolated": Keep delimiters as separate tokens
175+
* Example: "the-final--countdown" -> ["the", "-", "final", "-", "-",
176+
* "countdown"]
177+
*
173178
* Notice that the `invert` option is not supported.
174179
*/
175180
explicit RegexPreTokenizer(
176181
const std::string& pattern,
177182
bool is_delimiter = false,
178-
const std::string& behavior = "")
183+
const std::string& behavior = "Removed")
179184
: regex_(RegexPreTokenizer::create_regex_(pattern)),
180185
is_delimiter_(is_delimiter),
181-
behavior_(behavior) {}
186+
behavior_(behavior) {
187+
if (behavior_.empty() ||
188+
(behavior_ != "Removed" && behavior_ != "MergedWithPrevious" &&
189+
behavior_ != "Isolated")) {
190+
throw std::runtime_error("Unsupported behavior: " + behavior_);
191+
}
192+
}
182193

183194
/** Pre-tokenize with the stored regex */
184195
std::vector<std::string> pre_tokenize(const std::string& input) const;

src/pre_tokenizer.cpp

Lines changed: 30 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -38,12 +38,13 @@ PreTokenizer::Ptr PreTokenizerConfig::create() const {
3838
"Missing pattern for PreTokenizer of type Split");
3939
}
4040

41-
// Validate behavior parameter
42-
std::string behavior_str = behavior ? *behavior : "";
43-
if (!behavior_str.empty() && behavior_str != "MergedWithPrevious") {
41+
// Validate the behavior parameter; if it is missing, default to "Removed"
42+
std::string behavior_str = behavior ? *behavior : "Removed";
43+
if (behavior_str != "MergedWithPrevious" && behavior_str != "Isolated" &&
44+
behavior_str != "Removed") {
4445
throw std::runtime_error(
4546
"Unsupported behavior '" + behavior_str +
46-
"' for Split PreTokenizer. Only 'MergedWithPrevious' is supported.");
47+
"' for Split PreTokenizer. Only 'MergedWithPrevious', 'Removed' and 'Isolated' are supported.");
4748
}
4849

4950
// Validate invert parameter
@@ -196,8 +197,31 @@ std::vector<std::string> RegexPreTokenizer::pre_tokenize(
196197
if (last_end < input.length()) {
197198
results.push_back(input.substr(last_end));
198199
}
199-
} else {
200-
// Default delimiter behavior (split on delimiters)
200+
} else if (behavior_ == "Isolated") {
201+
// Isolated: Keep delimiters as separate tokens
202+
// Example: "the-final--countdown" with delimiter "-"
203+
// -> ["the", "-", "final", "-", "-", "countdown"]
204+
size_t last_end = 0;
205+
for (const auto& match : matches) {
206+
// Add text before the match (if any)
207+
if (match.start > last_end) {
208+
results.push_back(input.substr(last_end, match.start - last_end));
209+
}
210+
211+
// Add the delimiter itself as a separate token
212+
std::string delimiter =
213+
input.substr(match.start, match.end - match.start);
214+
results.push_back(delimiter);
215+
216+
last_end = match.end;
217+
}
218+
219+
// Add remaining text after the last match (if any)
220+
if (last_end < input.length()) {
221+
results.push_back(input.substr(last_end));
222+
}
223+
} else if (behavior_ == "Removed" || behavior_.empty()) {
224+
// Default delimiter behavior (split on delimiters, remove delimiters)
201225
size_t last_end = 0;
202226
for (const auto& match : matches) {
203227
// Add text before the match (if any)

test/test_isolated_behavior.cpp

Lines changed: 137 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,137 @@
1+
/*
2+
* Copyright (c) Meta Platforms, Inc. and affiliates.
3+
* All rights reserved.
4+
*
5+
* This source code is licensed under the BSD-style license found in the
6+
* LICENSE file in the root directory of this source tree.
7+
*/
8+
// @lint-ignore-every LICENSELINT
9+
10+
#include <gtest/gtest.h>
11+
#include <pytorch/tokenizers/pre_tokenizer.h>
12+
13+
using namespace tokenizers;
14+
15+
TEST(IsolatedBehaviorTest, BasicIsolatedBehavior) {
16+
// Test the example from the comment: "the-final--countdown" -> ["the", "-",
17+
// "final", "-", "-", "countdown"]
18+
RegexPreTokenizer tokenizer("-", true, "Isolated");
19+
std::string input = "the-final--countdown";
20+
std::vector<std::string> expected = {
21+
"the", "-", "final", "-", "-", "countdown"};
22+
std::vector<std::string> result = tokenizer.pre_tokenize(input);
23+
24+
EXPECT_EQ(result.size(), expected.size());
25+
for (size_t i = 0; i < expected.size(); ++i) {
26+
EXPECT_EQ(result[i], expected[i]) << "Mismatch at index " << i;
27+
}
28+
}
29+
30+
TEST(IsolatedBehaviorTest, SingleDelimiter) {
31+
// Test with single delimiter
32+
RegexPreTokenizer tokenizer("-", true, "Isolated");
33+
std::string input = "hello-world";
34+
std::vector<std::string> expected = {"hello", "-", "world"};
35+
std::vector<std::string> result = tokenizer.pre_tokenize(input);
36+
37+
EXPECT_EQ(result.size(), expected.size());
38+
for (size_t i = 0; i < expected.size(); ++i) {
39+
EXPECT_EQ(result[i], expected[i]) << "Mismatch at index " << i;
40+
}
41+
}
42+
43+
TEST(IsolatedBehaviorTest, NoDelimiters) {
44+
// Test with no delimiters
45+
RegexPreTokenizer tokenizer("-", true, "Isolated");
46+
std::string input = "helloworld";
47+
std::vector<std::string> expected = {"helloworld"};
48+
std::vector<std::string> result = tokenizer.pre_tokenize(input);
49+
50+
EXPECT_EQ(result.size(), expected.size());
51+
EXPECT_EQ(result[0], expected[0]);
52+
}
53+
54+
TEST(IsolatedBehaviorTest, DelimiterAtStart) {
55+
// Test with delimiter at start
56+
RegexPreTokenizer tokenizer("-", true, "Isolated");
57+
std::string input = "-hello";
58+
std::vector<std::string> expected = {"-", "hello"};
59+
std::vector<std::string> result = tokenizer.pre_tokenize(input);
60+
61+
EXPECT_EQ(result.size(), expected.size());
62+
for (size_t i = 0; i < expected.size(); ++i) {
63+
EXPECT_EQ(result[i], expected[i]) << "Mismatch at index " << i;
64+
}
65+
}
66+
67+
TEST(IsolatedBehaviorTest, DelimiterAtEnd) {
68+
// Test with delimiter at end
69+
RegexPreTokenizer tokenizer("-", true, "Isolated");
70+
std::string input = "hello-";
71+
std::vector<std::string> expected = {"hello", "-"};
72+
std::vector<std::string> result = tokenizer.pre_tokenize(input);
73+
74+
EXPECT_EQ(result.size(), expected.size());
75+
for (size_t i = 0; i < expected.size(); ++i) {
76+
EXPECT_EQ(result[i], expected[i]) << "Mismatch at index " << i;
77+
}
78+
}
79+
80+
TEST(IsolatedBehaviorTest, OnlyDelimiters) {
81+
// Test with only delimiters
82+
RegexPreTokenizer tokenizer("-", true, "Isolated");
83+
std::string input = "---";
84+
std::vector<std::string> expected = {"-", "-", "-"};
85+
std::vector<std::string> result = tokenizer.pre_tokenize(input);
86+
87+
EXPECT_EQ(result.size(), expected.size());
88+
for (size_t i = 0; i < expected.size(); ++i) {
89+
EXPECT_EQ(result[i], expected[i]) << "Mismatch at index " << i;
90+
}
91+
}
92+
93+
TEST(IsolatedBehaviorTest, SpaceDelimiter) {
94+
// Test with space as delimiter
95+
RegexPreTokenizer tokenizer(" ", true, "Isolated");
96+
std::string input = "hello world test";
97+
std::vector<std::string> expected = {"hello", " ", "world", " ", "test"};
98+
std::vector<std::string> result = tokenizer.pre_tokenize(input);
99+
100+
EXPECT_EQ(result.size(), expected.size());
101+
for (size_t i = 0; i < expected.size(); ++i) {
102+
EXPECT_EQ(result[i], expected[i]) << "Mismatch at index " << i;
103+
}
104+
}
105+
106+
TEST(IsolatedBehaviorTest, JSONConfig) {
107+
// Test with JSON configuration
108+
nlohmann::json config = {
109+
{"type", "Split"},
110+
{"pattern", {{"String", "-"}}},
111+
{"behavior", "Isolated"},
112+
{"invert", false}};
113+
114+
PreTokenizerConfig pre_config;
115+
pre_config.parse_json(config);
116+
auto tokenizer = pre_config.create();
117+
118+
std::string input = "the-final--countdown";
119+
std::vector<std::string> expected = {
120+
"the", "-", "final", "-", "-", "countdown"};
121+
std::vector<std::string> result = tokenizer->pre_tokenize(input);
122+
123+
EXPECT_EQ(result.size(), expected.size());
124+
for (size_t i = 0; i < expected.size(); ++i) {
125+
EXPECT_EQ(result[i], expected[i]) << "Mismatch at index " << i;
126+
}
127+
}
128+
129+
TEST(IsolatedBehaviorTest, EmptyInput) {
130+
// Test with empty input
131+
RegexPreTokenizer tokenizer("-", true, "Isolated");
132+
std::string input = "";
133+
std::vector<std::string> result = tokenizer.pre_tokenize(input);
134+
135+
EXPECT_EQ(result.size(), 1);
136+
EXPECT_EQ(result[0], "");
137+
}

test/test_pre_tokenizer.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -366,7 +366,7 @@ TEST_F(PreTokenizerConfigTest, SplitWithUnsupportedBehavior) {
366366
.parse_json(json{
367367
{"type", "Split"},
368368
{"pattern", {{"String", "-"}}},
369-
{"behavior", "Isolated"},
369+
{"behavior", "MergedWithNext"},
370370
{"invert", false},
371371
})
372372
.create(),

0 commit comments

Comments
 (0)