Skip to content

Commit 313fbd7

Browse files
authored
Bert fixes (#157)
* Lowercase normalization should have happened before * If stripAccents is null, strip when lowercase. See https://docs.rs/tokenizers/latest/src/tokenizers/normalizers/bert.rs.html#119-137 * stripAccents keeps the base character! * Test case for stripAccents * A couple of Bert tests with diacritics * Map distilbert tokenizer * BasicTokenizer lowercasing is now optional, and stripAccents is performed when lowercasing * Fix decoder regexp * Couple of no lowercase tests * Format * BertNormalizer tests: update for new defaults * Additional tests * Punctuation rules * Edge cases for bert tokenizers * Remove code copied by mistake
1 parent a867fea commit 313fbd7

File tree

9 files changed

+161
-20
lines changed

9 files changed

+161
-20
lines changed

Sources/Tokenizers/BertTokenizer.swift

Lines changed: 42 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ import Foundation
1010
import Hub
1111

1212
public class BertTokenizer {
13-
private let basicTokenizer = BasicTokenizer()
13+
private let basicTokenizer: BasicTokenizer
1414
private let wordpieceTokenizer: WordpieceTokenizer
1515
private let maxLen = 512
1616
private let tokenizeChineseChars: Bool
@@ -30,10 +30,12 @@ public class BertTokenizer {
3030
tokenizeChineseChars: Bool = true,
3131
bosToken: String? = nil,
3232
eosToken: String? = nil,
33-
fuseUnknownTokens: Bool = false
33+
fuseUnknownTokens: Bool = false,
34+
doLowerCase: Bool = true
3435
) {
3536
self.vocab = vocab
3637
self.ids_to_tokens = Utils.invert(vocab)
38+
self.basicTokenizer = BasicTokenizer(doLowerCase: doLowerCase)
3739
self.wordpieceTokenizer = WordpieceTokenizer(vocab: self.vocab)
3840
self.tokenizeChineseChars = tokenizeChineseChars
3941
self.bosToken = bosToken
@@ -52,7 +54,8 @@ public class BertTokenizer {
5254
let eosToken = tokenizerConfig.eosToken?.stringValue
5355
let bosToken = tokenizerConfig.bosToken?.stringValue
5456
let fuseUnknown = tokenizerConfig.fuseUnk?.boolValue ?? false
55-
self.init(vocab: vocab, merges: merges, tokenizeChineseChars: tokenizeChineseChars, bosToken: bosToken, eosToken: eosToken, fuseUnknownTokens: fuseUnknown)
57+
let doLowerCase = tokenizerConfig.doLowerCase?.boolValue ?? true
58+
self.init(vocab: vocab, merges: merges, tokenizeChineseChars: tokenizeChineseChars, bosToken: bosToken, eosToken: eosToken, fuseUnknownTokens: fuseUnknown, doLowerCase: doLowerCase)
5659
}
5760

5861

@@ -154,21 +157,36 @@ extension BertTokenizer: PreTrainedTokenizerModel {
154157

155158

156159
class BasicTokenizer {
160+
let doLowerCase: Bool
161+
162+
/// Creates a basic tokenizer.
/// - Parameter doLowerCase: Stored as-is. `maybeLowercase` and `maybeStripAccents`
///   return their input unchanged when it is `false`. Defaults to `true`.
init(doLowerCase: Bool = true) {
163+
self.doLowerCase = doLowerCase
164+
}
165+
157166
let neverSplit = [
158167
"[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]"
159168
]
160-
169+
170+
/// Folds diacritics out of `text` when `doLowerCase` is enabled.
/// - Parameter text: The string to process.
/// - Returns: `text` unchanged when `doLowerCase` is `false`; otherwise the
///   result of a diacritic-insensitive `folding` with no locale.
func maybeStripAccents(_ text: String) -> String {
171+
// Accent stripping is gated by the lowercasing flag: there is no separate switch for it here.
guard doLowerCase else { return text }
172+
return text.folding(options: .diacriticInsensitive, locale: nil)
173+
}
174+
175+
/// Lowercases `text` when `doLowerCase` is enabled.
/// - Parameter text: The string to process.
/// - Returns: `text.lowercased()` when `doLowerCase` is `true`; otherwise `text` unchanged.
func maybeLowercase(_ text: String) -> String {
176+
// Cased configurations keep the input exactly as given.
guard doLowerCase else { return text }
177+
return text.lowercased()
178+
}
179+
161180
func tokenize(text: String) -> [String] {
162-
let splitTokens = text.folding(options: .diacriticInsensitive, locale: nil)
163-
.components(separatedBy: NSCharacterSet.whitespaces)
181+
let splitTokens = maybeStripAccents(text).components(separatedBy: NSCharacterSet.whitespaces)
164182
let tokens = splitTokens.flatMap({ (token: String) -> [String] in
165183
if neverSplit.contains(token) {
166184
return [token]
167185
}
168186
var toks: [String] = []
169187
var currentTok = ""
170-
for c in token.lowercased() {
171-
if c.isLetter || c.isNumber || c == "°" {
188+
for c in maybeLowercase(token) {
189+
if !c.isExtendedPunctuation {
172190
currentTok += String(c)
173191
} else if currentTok.count > 0 {
174192
toks.append(currentTok)
@@ -187,6 +205,22 @@ class BasicTokenizer {
187205
}
188206
}
189207

208+
// Punctuation classification helper for `Character`.
extension Character {
209+
/// https://github.com/huggingface/transformers/blob/8c1b5d37827a6691fef4b2d926f2d04fb6f5a9e3/src/transformers/tokenization_utils.py#L367
///
/// `true` when `isPunctuation` is `true`, or when the value of the character's
/// first Unicode scalar lies in one of the ASCII ranges 33...47, 58...64,
/// 91...96 or 123...126. Those ranges include symbols such as `$`, `+`, `<`,
/// `^`, `` ` `` , `|` and `~`. Returns `false` for everything else.
210+
var isExtendedPunctuation: Bool {
211+
if isPunctuation { return true }
212+
// Only the first scalar is inspected; any further scalars in the character are ignored.
if let value = unicodeScalars.first?.value {
213+
switch value {
214+
case 33...47: return true
215+
case 58...64: return true
216+
case 91...96: return true
217+
case 123...126: return true
218+
default: return false
219+
}
220+
}
221+
return false
222+
}
223+
}
190224

191225
class WordpieceTokenizer {
192226
let unkToken = "[UNK]"

Sources/Tokenizers/Decoder.swift

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,7 @@ class WordPieceDecoder: Decoder {
5858
let cleanup: Bool
5959

6060
// https://github.com/huggingface/tokenizers/blob/main/tokenizers/src/decoders/wordpiece.rs#L31
61-
private let re = try! NSRegularExpression(pattern: "\\s(\\.|\\?|\\!|\\,|'|n't|'m|'s|'ve|'re)", options: [])
61+
private let re = try! NSRegularExpression(pattern: "\\s(\\.|\\?|\\!|\\,|'\\s|n't|'m|'s|'ve|'re)", options: [])
6262

6363
required public init(config: Config) {
6464
guard let prefix = config.prefix?.stringValue else { fatalError("Missing `prefix` configuration for WordPieceDecoder.") }

Sources/Tokenizers/Normalizer.swift

Lines changed: 7 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -146,14 +146,14 @@ class NFKCNormalizer: Normalizer {
146146
class BertNormalizer: Normalizer {
147147
let shouldCleanText: Bool
148148
let shouldHandleChineseChars: Bool
149-
let shouldStripAccents: Bool?
149+
let shouldStripAccents: Bool
150150
let shouldLowercase: Bool
151151

152152
required init(config: Config) {
153153
self.shouldCleanText = config.cleanText?.boolValue ?? true
154154
self.shouldHandleChineseChars = config.handleChineseChars?.boolValue ?? true
155-
self.shouldStripAccents = config.stripAccents?.boolValue
156155
self.shouldLowercase = config.lowercase?.boolValue ?? true
156+
self.shouldStripAccents = config.stripAccents?.boolValue ?? shouldLowercase
157157
}
158158

159159
func normalize(text: String) -> String {
@@ -164,7 +164,7 @@ class BertNormalizer: Normalizer {
164164
if shouldHandleChineseChars {
165165
output = handleChineseChars(text: output)
166166
}
167-
if shouldStripAccents ?? false {
167+
if shouldStripAccents {
168168
output = stripAccents(text: output)
169169
}
170170
if shouldLowercase {
@@ -219,12 +219,10 @@ class BertNormalizer: Normalizer {
219219
}
220220

221221
private func stripAccents(text: String) -> String {
222-
text.decomposedStringWithCanonicalMapping
223-
.filter {
224-
$0.unicodeScalars.allSatisfy { scalar in
225-
!(0x0300 <= scalar.value && scalar.value <= 0x036F)
226-
}
227-
}
222+
// This might be the same as `text.folding(options: .diacriticInsensitive, locale: nil)`
223+
String(text.decomposedStringWithCanonicalMapping.unicodeScalars.filter { scalar in
224+
!(0x0300 <= scalar.value && scalar.value <= 0x036F)
225+
})
228226
}
229227
}
230228

Sources/Tokenizers/Tokenizer.swift

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,8 @@ public protocol PreTrainedTokenizerModel: TokenizingModel {
6363
struct TokenizerModel {
6464
static let knownTokenizers: [String : PreTrainedTokenizerModel.Type] = [
6565
"BertTokenizer" : BertTokenizer.self,
66+
"DistilbertTokenizer": BertTokenizer.self,
67+
"DistilBertTokenizer": BertTokenizer.self,
6668
"CodeGenTokenizer" : CodeGenTokenizer.self,
6769
"CodeLlamaTokenizer" : CodeLlamaTokenizer.self,
6870
"FalconTokenizer" : FalconTokenizer.self,
@@ -270,7 +272,8 @@ public class PreTrainedTokenizer: Tokenizer {
270272
func cleanUp(text: String) -> String {
271273
guard cleanUpTokenizationSpaces else { return text }
272274

273-
return text.replacingOccurrences(of: " .", with: ".")
275+
return text
276+
.replacingOccurrences(of: " .", with: ".")
274277
.replacingOccurrences(of: " ?", with: "?")
275278
.replacingOccurrences(of: " !", with: "!")
276279
.replacingOccurrences(of: " ,", with: ",")

Tests/NormalizerTests/NormalizerTests.swift

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -120,6 +120,19 @@ class NormalizerTests: XCTestCase {
120120
XCTAssertNotNil(NormalizerFactory.fromConfig(config: config) as? NFKCNormalizer)
121121
}
122122

123+
/// Checks `BertNormalizer.normalize(text:)` with `stripAccents` explicitly set to `true`.
/// Each case is an (input, expected output) pair.
func testStripAccents() {
124+
let testCases: [(String, String)] = [
125+
("département", "departement"),
126+
]
127+
128+
//TODO: test combinations with/without lowercase
129+
// Only `stripAccents` is set here; every other normalizer option is left to its default.
let config = Config(["stripAccents":true])
130+
let normalizer = BertNormalizer(config: config)
131+
for (arg, expect) in testCases {
132+
XCTAssertEqual(normalizer.normalize(text: arg), expect)
133+
}
134+
}
135+
123136
func testBertNormalizer() {
124137
let testCases: [(String, String)] = [
125138
("Café", "café"),
@@ -133,6 +146,30 @@ class NormalizerTests: XCTestCase {
133146
("\u{00C5}", "\u{00E5}"),
134147
]
135148

149+
for (arg, expect) in testCases {
150+
let config = Config(["stripAccents":false])
151+
let normalizer = BertNormalizer(config: config)
152+
XCTAssertEqual(normalizer.normalize(text: arg), expect)
153+
}
154+
155+
let config = Config(["type": NormalizerType.Bert.rawValue])
156+
XCTAssertNotNil(NormalizerFactory.fromConfig(config: config) as? BertNormalizer)
157+
}
158+
159+
func testBertNormalizerDefaults() {
160+
// Python verification: t._tokenizer.normalizer.normalize_str("Café")
161+
let testCases: [(String, String)] = [
162+
("Café", "cafe"),
163+
("François", "francois"),
164+
("Ωmega", "ωmega"),
165+
("über", "uber"),
166+
("háček", "hacek"),
167+
("Häagen\tDazs", "haagen dazs"),
168+
("你好!", " 你 好 !"),
169+
("𝔄𝔅ℭ⓵⓶⓷︷,︸,i⁹,i₉,㌀,¼", "𝔄𝔅ℭ⓵⓶⓷︷,︸,i⁹,i₉,㌀,¼"),
170+
("Å", "a"),
171+
]
172+
136173
for (arg, expect) in testCases {
137174
let config = Config([:])
138175
let normalizer = BertNormalizer(config: config)
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
{"text": "Fatouville-Grestain est une commune du Nord-Ouest du d\u00e9partement de l'Eure situ\u00e9e au \nbord de l'estuaire de la Seine et \u00e0 proximit\u00e9 du d\u00e9partement du Calvados. Selon l'atlas des paysages \nde Haute-Normandie, elle appartient \u00e0 la r\u00e9gion naturelle du Lieuvin. Toutefois, l'Agreste, le service \nde la statistique et de la prospective du minist\u00e8re de l'Agriculture, de l'Agroalimentaire et de la For\u00eat, \nla classe au sein du pays d'Auge (en tant que r\u00e9gion agricole).La commune est \u00e0 moins de dix kilom\u00e8tres \u00e0 \nl'est de Honfleur, \u00e0 autant de Beuzeville et \u00e0 environ dix-sept kilom\u00e8tres de Pont-Audemer.", "bpe_tokens": ["fat", "##ou", "##ville", "-", "gr", "##est", "##ain", "est", "une", "commune", "du", "nord", "-", "ou", "##est", "du", "depart", "##ement", "de", "l", "'", "eu", "##re", "situ", "##ee", "au", "bo", "##rd", "de", "l", "'", "est", "##ua", "##ire", "de", "la", "seine", "et", "a", "pro", "##xi", "##mite", "du", "depart", "##ement", "du", "cal", "##va", "##dos", ".", "se", "##lon", "l", "'", "atlas", "des", "pays", "##ages", "de", "haute", "-", "norman", "##die", ",", "elle", "app", "##art", "##ient", "a", "la", "region", "nature", "##lle", "du", "lieu", "##vin", ".", "to", "##ute", "##fo", "##is", ",", "l", "'", "ag", "##rest", "##e", ",", "le", "service", "de", "la", "stat", "##ist", "##ique", "et", "de", "la", "prospective", "du", "minister", "##e", "de", "l", "'", "agriculture", ",", "de", "l", "'", "ag", "##ro", "##ali", "##ment", "##aire", "et", "de", "la", "fore", "##t", ",", "la", "class", "##e", "au", "se", "##in", "du", "pays", "d", "'", "aug", "##e", "(", "en", "tan", "##t", "que", "region", "ag", "##ric", "##ole", ")", ".", "la", "commune", "est", "a", "moi", "##ns", "de", "di", "##x", "kilometres", "a", "l", "'", "est", "de", "hon", "##fle", "##ur", ",", "a", "au", "##tan", "##t", "de", "be", "##uze", "##ville", "et", "a", "en", "##vir", "##on", "di", "##x", 
"-", "sept", "kilometres", "de", "pont", "-", "au", "##de", "##mer", "."], "token_ids": [101, 6638, 7140, 3077, 1011, 24665, 4355, 8113, 9765, 16655, 5715, 4241, 13926, 1011, 15068, 4355, 4241, 18280, 13665, 2139, 1048, 1005, 7327, 2890, 26179, 4402, 8740, 8945, 4103, 2139, 1048, 1005, 9765, 6692, 7442, 2139, 2474, 16470, 3802, 1037, 4013, 9048, 23419, 4241, 18280, 13665, 4241, 10250, 3567, 12269, 1012, 7367, 7811, 1048, 1005, 11568, 4078, 12778, 13923, 2139, 18535, 1011, 5879, 10265, 1010, 15317, 10439, 8445, 11638, 1037, 2474, 2555, 3267, 6216, 4241, 22470, 6371, 1012, 2000, 10421, 14876, 2483, 1010, 1048, 1005, 12943, 28533, 2063, 1010, 3393, 2326, 2139, 2474, 28093, 2923, 7413, 3802, 2139, 2474, 17464, 4241, 2704, 2063, 2139, 1048, 1005, 5237, 1010, 2139, 1048, 1005, 12943, 3217, 11475, 3672, 14737, 3802, 2139, 2474, 18921, 2102, 1010, 2474, 2465, 2063, 8740, 7367, 2378, 4241, 12778, 1040, 1005, 15476, 2063, 1006, 4372, 9092, 2102, 10861, 2555, 12943, 7277, 9890, 1007, 1012, 2474, 5715, 9765, 1037, 25175, 3619, 2139, 4487, 2595, 3717, 1037, 1048, 1005, 9765, 2139, 10189, 21031, 3126, 1010, 1037, 8740, 5794, 2102, 2139, 2022, 20395, 3077, 3802, 1037, 4372, 21663, 2239, 4487, 2595, 1011, 17419, 3717, 2139, 21179, 1011, 8740, 3207, 5017, 1012, 102], "decoded_text": "[CLS] fatouville - grestain est une commune du nord - ouest du departement de l'eure situee au bord de l'estuaire de la seine et a proximite du departement du calvados. selon l'atlas des paysages de haute - normandie, elle appartient a la region naturelle du lieuvin. toutefois, l'agreste, le service de la statistique et de la prospective du ministere de l'agriculture, de l'agroalimentaire et de la foret, la classe au sein du pays d'auge ( en tant que region agricole ). la commune est a moins de dix kilometres a l'est de honfleur, a autant de beuzeville et a environ dix - sept kilometres de pont - audemer. [SEP]"}
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
{"text": "Fatouville-Grestain est une commune du Nord-Ouest du d\u00e9partement de l'Eure situ\u00e9e au \nbord de l'estuaire de la Seine et \u00e0 proximit\u00e9 du d\u00e9partement du Calvados. Selon l'atlas des paysages \nde Haute-Normandie, elle appartient \u00e0 la r\u00e9gion naturelle du Lieuvin. Toutefois, l'Agreste, le service \nde la statistique et de la prospective du minist\u00e8re de l'Agriculture, de l'Agroalimentaire et de la For\u00eat, \nla classe au sein du pays d'Auge (en tant que r\u00e9gion agricole).La commune est \u00e0 moins de dix kilom\u00e8tres \u00e0 \nl'est de Honfleur, \u00e0 autant de Beuzeville et \u00e0 environ dix-sept kilom\u00e8tres de Pont-Audemer.", "bpe_tokens": ["Fat", "##ou", "##ville", "-", "G", "##resta", "##in", "est", "une", "commune", "du", "Nord", "-", "Ouest", "du", "d\u00e9partement", "de", "l", "'", "Eure", "situ\u00e9e", "au", "bord", "de", "l", "'", "est", "##uaire", "de", "la", "Seine", "et", "\u00e0", "proximit\u00e9", "du", "d\u00e9partement", "du", "Calvados", ".", "Selon", "l", "'", "atlas", "des", "paysage", "##s", "de", "Haute", "-", "Normandie", ",", "elle", "appartient", "\u00e0", "la", "r\u00e9gion", "naturelle", "du", "Lie", "##uv", "##in", ".", "Toutefois", ",", "l", "'", "A", "##gres", "##te", ",", "le", "service", "de", "la", "statistique", "et", "de", "la", "pro", "##spect", "##ive", "du", "minist\u00e8re", "de", "l", "'", "Agriculture", ",", "de", "l", "'", "A", "##gro", "##alim", "##entaire", "et", "de", "la", "For\u00eat", ",", "la", "classe", "au", "sein", "du", "pays", "d", "'", "Auge", "(", "en", "tant", "que", "r\u00e9gion", "agricole", ")", ".", "La", "commune", "est", "\u00e0", "moins", "de", "dix", "kilom\u00e8tres", "\u00e0", "l", "'", "est", "de", "Hon", "##f", "##leur", ",", "\u00e0", "autant", "de", "Be", "##uze", "##ville", "et", "\u00e0", "environ", "dix", "-", "sept", "kilom\u00e8tres", "de", "Pont", "-", "Aude", "##mer", "."], "token_ids": [101, 48803, 11010, 12043, 118, 144, 
84038, 10245, 10176, 10231, 11380, 10168, 12004, 118, 21781, 10168, 16236, 10104, 180, 112, 35935, 15366, 10257, 27482, 10104, 180, 112, 10176, 54154, 10104, 10109, 13682, 10131, 254, 35483, 10168, 16236, 10168, 51934, 119, 20115, 180, 112, 92753, 10139, 93483, 10107, 10104, 17735, 118, 25771, 117, 11117, 52199, 254, 10109, 14387, 37232, 10168, 39710, 67000, 10245, 119, 46573, 117, 180, 112, 138, 68094, 10216, 117, 10141, 11989, 10104, 10109, 29303, 10131, 10104, 10109, 11284, 77229, 11942, 10168, 41853, 10104, 180, 112, 30954, 117, 10104, 180, 112, 138, 46692, 94974, 106895, 10131, 10104, 10109, 86549, 117, 10109, 15702, 10257, 11479, 10168, 13850, 172, 112, 72800, 113, 10110, 14222, 10121, 14387, 50350, 114, 119, 10159, 11380, 10176, 254, 14443, 10104, 23214, 22308, 254, 180, 112, 10176, 10104, 19431, 10575, 55692, 117, 254, 38585, 10104, 14321, 33302, 12043, 10131, 254, 16844, 23214, 118, 25097, 22308, 10104, 23986, 118, 55665, 12371, 119, 102], "decoded_text": "[CLS] Fatouville - Grestain est une commune du Nord - Ouest du d\u00e9partement de l'Eure situ\u00e9e au bord de l'estuaire de la Seine et \u00e0 proximit\u00e9 du d\u00e9partement du Calvados. Selon l'atlas des paysages de Haute - Normandie, elle appartient \u00e0 la r\u00e9gion naturelle du Lieuvin. Toutefois, l'Agreste, le service de la statistique et de la prospective du minist\u00e8re de l'Agriculture, de l'Agroalimentaire et de la For\u00eat, la classe au sein du pays d'Auge ( en tant que r\u00e9gion agricole ). La commune est \u00e0 moins de dix kilom\u00e8tres \u00e0 l'est de Honfleur, \u00e0 autant de Beuzeville et \u00e0 environ dix - sept kilom\u00e8tres de Pont - Audemer. [SEP]"}

Tests/TokenizersTests/Resources/tokenizer_tests.json

100755 → 100644
Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

Tests/TokenizersTests/TokenizerTests.swift

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,18 @@ class T5TokenizerTests: TokenizerTests {
6060
override class var unknownTokenId: Int? { 2 }
6161
}
6262

63+
/// `TokenizerTests` subclass configured for the cased multilingual DistilBERT tokenizer.
/// It overrides the hub model name, the encoded-samples filename and the unknown-token id.
class BertCasedTokenizerTests: TokenizerTests {
64+
override class var hubModelName: String? { "distilbert/distilbert-base-multilingual-cased" }
65+
override class var encodedSamplesFilename: String? { "distilbert_cased_encoded" }
66+
override class var unknownTokenId: Int? { 100 }
67+
}
68+
69+
/// `TokenizerTests` subclass configured for the uncased BERT base tokenizer.
/// It overrides the hub model name, the encoded-samples filename and the unknown-token id.
class BertUncasedTokenizerTests: TokenizerTests {
70+
override class var hubModelName: String? { "google-bert/bert-base-uncased" }
71+
override class var encodedSamplesFilename: String? { "bert_uncased_encoded" }
72+
override class var unknownTokenId: Int? { 100 }
73+
}
74+
6375
class GemmaTokenizerTests: TokenizerTests {
6476
override class var hubModelName: String? { "pcuenq/gemma-tokenizer" }
6577
override class var encodedSamplesFilename: String? { "gemma_encoded" }
@@ -108,6 +120,61 @@ class PhiSimpleTests: XCTestCase {
108120
}
109121
}
110122

123+
/// Tokenization checks for BERT-family tokenizers on text with diacritics,
/// mixed case and currency symbols. Each test loads its tokenizer by hub model
/// name via `AutoTokenizer.from(pretrained:)`.
class BertDiacriticsTests: XCTestCase {
124+
/// Cased multilingual DistilBERT: "mąka" encodes to the listed ids and "Car" keeps its capital.
func testBertCased() async throws {
125+
guard let tokenizer = try await AutoTokenizer.from(pretrained: "distilbert/distilbert-base-multilingual-cased") as? PreTrainedTokenizer else {
126+
XCTFail()
127+
return
128+
}
129+
130+
XCTAssertEqual(tokenizer.encode(text: "mąka"), [101, 181, 102075, 10113, 102])
131+
XCTAssertEqual(tokenizer.tokenize(text: "Car"), ["Car"])
132+
}
133+
134+
/// Same encoding check as `testBertCased`, against a different hub repository for the cased tokenizer.
func testBertCasedResaved() async throws {
135+
guard let tokenizer = try await AutoTokenizer.from(pretrained: "pcuenq/distilbert-base-multilingual-cased-tokenizer") as? PreTrainedTokenizer else {
136+
XCTFail()
137+
return
138+
}
139+
140+
XCTAssertEqual(tokenizer.encode(text: "mąka"), [101, 181, 102075, 10113, 102])
141+
}
142+
143+
/// Uncased BERT: the expected tokens are lowercase and carry no diacritics.
func testBertUncased() async throws {
144+
guard let tokenizer = try await AutoTokenizer.from(pretrained: "google-bert/bert-base-uncased") as? PreTrainedTokenizer else {
145+
XCTFail()
146+
return
147+
}
148+
149+
XCTAssertEqual(tokenizer.tokenize(text: "mąka"), ["ma", "##ka"])
150+
XCTAssertEqual(tokenizer.encode(text: "mąka"), [101, 5003, 2912, 102])
151+
XCTAssertEqual(tokenizer.tokenize(text: "département"), ["depart", "##ement"])
152+
XCTAssertEqual(tokenizer.encode(text: "département"), [101, 18280, 13665, 102])
153+
XCTAssertEqual(tokenizer.tokenize(text: "Car"), ["car"])
154+
155+
// NOTE(review): the leading pieces for "€4", "₹8" and "₱9" were empty strings in this listing.
// An empty piece directly before a `##` continuation is not a plausible split, and the input
// literals still contain the symbols, so they are restored here. Confirm against the repository.
XCTAssertEqual(tokenizer.tokenize(text: "€4"), ["€", "##4"])
156+
XCTAssertEqual(tokenizer.tokenize(text: "test $1 R2 #3 €4 £5 ¥6 ₣7 ₹8 ₱9 test"), ["test", "$", "1", "r", "##2", "#", "3", "€", "##4", "£5", "¥", "##6", "[UNK]", "₹", "##8", "₱", "##9", "test"])
157+
}
158+
}
159+
160+
/// Round-trip check (tokenize, encode, decode) for text containing an apostrophe,
/// using the uncased BERT base tokenizer loaded by hub model name.
class BertSpacesTests: XCTestCase {
161+
/// Tokenizes and encodes "l'eure", then decodes with special tokens skipped
/// and expects the original text back.
func testEncodeDecode() async throws {
162+
guard let tokenizer = try await AutoTokenizer.from(pretrained: "google-bert/bert-base-uncased") as? PreTrainedTokenizer else {
163+
XCTFail()
164+
return
165+
}
166+
167+
let text = "l'eure"
168+
let tokenized = tokenizer.tokenize(text: text)
169+
XCTAssertEqual(tokenized, ["l", "'", "eu", "##re"])
170+
let encoded = tokenizer.encode(text: text)
171+
XCTAssertEqual(encoded, [101, 1048, 1005, 7327, 2890, 102])
172+
let decoded = tokenizer.decode(tokens: encoded, skipSpecialTokens: true)
173+
// Note: this matches the behaviour of the Python "slow" tokenizer, but the fast one produces "l ' eure"
174+
XCTAssertEqual(decoded, "l'eure")
175+
}
176+
}
177+
111178

112179
struct EncodedTokenizerSamplesDataset: Decodable {
113180
let text: String

0 commit comments

Comments
 (0)