Skip to content

Commit a867fea

Browse files
Fix Phi 4 tokenization (#153)
* Fix Phi 4 tokenization
* Fix: remove empty trailing pretokenized match
* Realistic pre-tokenizer tests, double-checked with transformers.js
* Couple more test cases

Co-authored-by: Anthony <anthony@depasquale.org>
1 parent 43ae46d commit a867fea

File tree

3 files changed

+27
-11
lines changed

3 files changed

+27
-11
lines changed

Sources/Tokenizers/PreTokenizer.swift

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -248,7 +248,7 @@ extension StringSplitPattern {
248248
func split(_ text: String, invert: Bool = true) -> [String] {
249249
switch self {
250250
case .regexp(let regexp):
251-
return text.split(by: regexp, includeSeparators: !invert)
251+
return text.split(by: regexp, includeSeparators: true)
252252
case .string(let substring):
253253
return text.split(by: substring, options: [], includeSeparators: !invert)
254254
}
@@ -292,7 +292,9 @@ public extension String {
292292
start = range.upperBound
293293
}
294294

295-
result.append(String(self[start...]))
295+
if omittingEmptySubsequences && start < endIndex {
296+
result.append(String(self[start...]))
297+
}
296298
return result
297299
}
298300

Tests/PreTokenizerTests/PreTokenizerTests.swift

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -119,7 +119,7 @@ class PreTokenizerTests: XCTestCase {
119119
)
120120
XCTAssertEqual(
121121
preTokenizer1.preTokenize(text: " Hey, friend, what's up? "),
122-
[" ", " ", " ", "Hey,", " ", " ", " ", " ", "friend,", " ", " ", " ", " ", "what's", " ", "up?", " ", " ", ""]
122+
[" ", " ", " ", "Hey,", " ", " ", " ", " ", "friend,", " ", " ", " ", " ", "what's", " ", "up?", " ", " "]
123123
)
124124

125125
let preTokenizer2 = SplitPreTokenizer(config: Config(["pattern": ["Regex": "\\s"]]))
@@ -133,21 +133,22 @@ class PreTokenizerTests: XCTestCase {
133133
)
134134
XCTAssertEqual(
135135
preTokenizer2.preTokenize(text: " Hey, friend, what's up? "),
136-
[" ", " ", " ", "Hey,", " ", " ", " ", " ", "friend,", " ", " ", " ", " ", "what's", " ", "up?", " ", " ", ""]
136+
[" ", " ", " ", "Hey,", " ", " ", " ", " ", "friend,", " ", " ", " ", " ", "what's", " ", "up?", " ", " "]
137137
)
138138

139-
let preTokenizer3 = SplitPreTokenizer(config: Config(["pattern": ["Regex": "\\s"], "invert": true]))
139+
let preTokenizer3 = SplitPreTokenizer(config: Config(["pattern": ["Regex": "(?i:\'s|\'t|\'re|\'ve|\'m|\'ll|\'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"], "invert": true]))
140140
XCTAssertEqual(
141-
preTokenizer3.preTokenize(text: "Hey friend!"),
142-
["Hey", "friend!"]
141+
preTokenizer3.preTokenize(text: "Hello"),
142+
["Hello"]
143143
)
144+
144145
XCTAssertEqual(
145-
preTokenizer3.preTokenize(text: "Hey friend! How are you?!?"),
146-
["Hey", "friend!", "How", "are", "you?!?"]
146+
preTokenizer3.preTokenize(text: "Hey friend!"),
147+
["Hey", " friend", "!"]
147148
)
148149
XCTAssertEqual(
149-
preTokenizer3.preTokenize(text: " Hey, friend, what's up? "),
150-
["Hey,", "friend,", "what's", "up?", ""]
150+
preTokenizer3.preTokenize(text: "Hey friend! How are you?!?"),
151+
["Hey", " friend", "!", " ", " How", " are", " you", "?!?"]
151152
)
152153
}
153154

Tests/TokenizersTests/TokenizerTests.swift

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,19 @@ class GemmaUnicodeTests: XCTestCase {
9595
}
9696
}
9797

98+
class PhiSimpleTests: XCTestCase {
99+
func testPhi4() async throws {
100+
guard let tokenizer = try await AutoTokenizer.from(pretrained: "microsoft/phi-4") as? PreTrainedTokenizer else {
101+
XCTFail()
102+
return
103+
}
104+
105+
XCTAssertEqual(tokenizer.encode(text: "hello"), [15339])
106+
XCTAssertEqual(tokenizer.encode(text: "hello world"), [15339, 1917])
107+
XCTAssertEqual(tokenizer.encode(text: "<|im_start|>user<|im_sep|>Who are you?<|im_end|><|im_start|>assistant<|im_sep|>"), [100264, 882, 100266, 15546, 527, 499, 30, 100265, 100264, 78191, 100266])
108+
}
109+
}
110+
98111

99112
struct EncodedTokenizerSamplesDataset: Decodable {
100113
let text: String

0 commit comments

Comments (0)