@@ -19,11 +19,12 @@ using namespace tokenizers;
19
19
20
20
// Helpers /////////////////////////////////////////////////////////////////////
21
21
22
- static void assert_split_match (const PreTokenizer &ptok,
23
- const std::string &prompt,
24
- const std::vector<std::string> &expected) {
22
+ static void assert_split_match (
23
+ const PreTokenizer& ptok,
24
+ const std::string& prompt,
25
+ const std::vector<std::string>& expected) {
25
26
re2::StringPiece prompt_view (prompt);
26
- const auto & got = ptok.pre_tokenize (prompt_view);
27
+ const auto & got = ptok.pre_tokenize (prompt_view);
27
28
EXPECT_EQ (expected.size (), got.size ());
28
29
for (auto i = 0 ; i < got.size (); ++i) {
29
30
EXPECT_EQ (expected[i], got[i]);
@@ -34,14 +35,16 @@ static void assert_split_match(const PreTokenizer &ptok,
34
35
class RegexPreTokenizerTest : public ::testing::Test {};
35
36
36
37
// Test the basic construction
37
- TEST_F (RegexPreTokenizerTest, Construct) { RegexPreTokenizer ptok (" [0-9]+" ); }
38
+ TEST_F (RegexPreTokenizerTest, Construct) {
39
+ RegexPreTokenizer ptok (" [0-9]+" );
40
+ }
38
41
39
42
// Test basic splitting using the expression for Tiktoken
40
43
TEST_F (RegexPreTokenizerTest, TiktokenExpr) {
41
44
RegexPreTokenizer ptok (
42
45
R"( (?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+)" );
43
- assert_split_match (ptok, " How are you doing? " ,
44
- {" How" , " are" , " you" , " doing" , " ?" });
46
+ assert_split_match (
47
+ ptok, " How are you doing? " , {" How" , " are" , " you" , " doing" , " ?" });
45
48
}
46
49
47
50
// DigitsPreTokenizer //////////////////////////////////////////////////////////
@@ -51,15 +54,18 @@ class DigitsPreTokenizerTest : public ::testing::Test {};
51
54
TEST_F (DigitsPreTokenizerTest, IndividualDigits) {
52
55
DigitsPreTokenizer ptok (true );
53
56
assert_split_match (
54
- ptok, " The number 1 then 234 then 5." ,
57
+ ptok,
58
+ " The number 1 then 234 then 5." ,
55
59
{" The number " , " 1" , " then " , " 2" , " 3" , " 4" , " then " , " 5" , " ." });
56
60
}
57
61
58
62
// Test digit splitting with contiguous digits
59
63
TEST_F (DigitsPreTokenizerTest, ContiguousDigits) {
60
64
DigitsPreTokenizer ptok (false );
61
- assert_split_match (ptok, " The number 1 then 234 then 5." ,
62
- {" The number " , " 1" , " then " , " 234" , " then " , " 5" , " ." });
65
+ assert_split_match (
66
+ ptok,
67
+ " The number 1 then 234 then 5." ,
68
+ {" The number " , " 1" , " then " , " 234" , " then " , " 5" , " ." });
63
69
}
64
70
65
71
// ByteLevelPreTokenizer ///////////////////////////////////////////////////////
@@ -69,7 +75,8 @@ TEST_F(ByteLevelPreTokenizerTest, PreTokenizeDefault) {
69
75
ByteLevelPreTokenizer ptok;
70
76
assert_split_match (ptok, " Hello World" , {" ĠHello" , " ĠWorld" });
71
77
assert_split_match (
72
- ptok, " The number 1 then 234 then 5." ,
78
+ ptok,
79
+ " The number 1 then 234 then 5." ,
73
80
{" ĠThe" , " Ġnumber" , " Ġ1" , " Ġthen" , " Ġ234" , " Ġthen" , " Ġ5" , " ." });
74
81
}
75
82
@@ -90,9 +97,22 @@ TEST_F(SequencePreTokenizerTest, PreTokenizeDigitAndByteLevel) {
90
97
PreTokenizer::Ptr dptok (new DigitsPreTokenizer (true ));
91
98
PreTokenizer::Ptr bptok (new ByteLevelPreTokenizer (false ));
92
99
SequencePreTokenizer ptok ({dptok, bptok});
93
- assert_split_match (ptok, " The number 1 then 234 then 5." ,
94
- {" The" , " Ġnumber" , " Ġ" , " 1" , " Ġthen" , " Ġ" , " 2" , " 3" , " 4" ,
95
- " Ġthen" , " Ġ" , " 5" , " ." });
100
+ assert_split_match (
101
+ ptok,
102
+ " The number 1 then 234 then 5." ,
103
+ {" The" ,
104
+ " Ġnumber" ,
105
+ " Ġ" ,
106
+ " 1" ,
107
+ " Ġthen" ,
108
+ " Ġ" ,
109
+ " 2" ,
110
+ " 3" ,
111
+ " 4" ,
112
+ " Ġthen" ,
113
+ " Ġ" ,
114
+ " 5" ,
115
+ " ." });
96
116
}
97
117
98
118
// PreTokenizerConfig //////////////////////////////////////////////////////////
@@ -132,12 +152,14 @@ TEST_F(PreTokenizerConfigTest, AllTypesFailureCases) {
132
152
133
153
// Sequence
134
154
EXPECT_THROW (PreTokenizerConfig (" Sequence" ).create (), std::runtime_error);
135
- EXPECT_THROW (PreTokenizerConfig (" Sequence" ).set_pretokenizers ({}).create (),
136
- std::runtime_error);
137
- EXPECT_THROW (PreTokenizerConfig (" Sequence" )
138
- .set_pretokenizers ({PreTokenizerConfig (" Split" )})
139
- .create (),
140
- std::runtime_error);
155
+ EXPECT_THROW (
156
+ PreTokenizerConfig (" Sequence" ).set_pretokenizers ({}).create (),
157
+ std::runtime_error);
158
+ EXPECT_THROW (
159
+ PreTokenizerConfig (" Sequence" )
160
+ .set_pretokenizers ({PreTokenizerConfig (" Split" )})
161
+ .create (),
162
+ std::runtime_error);
141
163
142
164
// Unsupported
143
165
EXPECT_THROW (PreTokenizerConfig (" Unsupported" ).create (), std::runtime_error);
@@ -161,9 +183,22 @@ TEST_F(PreTokenizerConfigTest, ParseJson) {
161
183
}},
162
184
})
163
185
.create ();
164
- assert_split_match (*ptok, " The number 1 then 234 then 5." ,
165
- {" The" , " Ġnumber" , " Ġ" , " 1" , " Ġthen" , " Ġ" , " 2" , " 3" , " 4" ,
166
- " Ġthen" , " Ġ" , " 5" , " ." });
186
+ assert_split_match (
187
+ *ptok,
188
+ " The number 1 then 234 then 5." ,
189
+ {" The" ,
190
+ " Ġnumber" ,
191
+ " Ġ" ,
192
+ " 1" ,
193
+ " Ġthen" ,
194
+ " Ġ" ,
195
+ " 2" ,
196
+ " 3" ,
197
+ " 4" ,
198
+ " Ġthen" ,
199
+ " Ġ" ,
200
+ " 5" ,
201
+ " ." });
167
202
}
168
203
169
204
TEST_F (PreTokenizerConfigTest, ParseJsonOptionalKey) {
@@ -173,8 +208,10 @@ TEST_F(PreTokenizerConfigTest, ParseJsonOptionalKey) {
173
208
{" type" , " Digits" },
174
209
})
175
210
.create ();
176
- assert_split_match (*ptok, " The number 1 then 234 then 5." ,
177
- {" The number " , " 1" , " then " , " 234" , " then " , " 5" , " ." });
211
+ assert_split_match (
212
+ *ptok,
213
+ " The number 1 then 234 then 5." ,
214
+ {" The number " , " 1" , " then " , " 234" , " then " , " 5" , " ." });
178
215
}
179
216
180
217
TEST_F (PreTokenizerConfigTest, Split) {
0 commit comments