File tree: 3 files changed, +0 −14 lines changed
3 files changed, +0 −14 lines changed
Original file line number | Diff line number | Diff line change @@ -163,7 +163,6 @@ class BPETokenizer: PreTrainedTokenizerModel {
163
163
let bpeTokens = self . bpe ( token: text) . split ( separator: " " ) . map { String ( $0) }
164
164
for token in bpeTokens {
165
165
if convertTokenToId ( token) != unknownTokenId {
166
- // if let _ = tokensToIds[token as NSString] {
167
166
tokens. append ( token)
168
167
} else {
169
168
// TODO: if config.byte_fallback is False, append the unknown token instead
Original file line number Diff line number Diff line change @@ -183,15 +183,6 @@ public class PreTrainedTokenizer: Tokenizer {
183
183
} . joined ( separator: " | " )
184
184
addedTokensRegex = try ? NSRegularExpression ( pattern: addedTokensRegexString, options: [ ] )
185
185
186
- // let addedTokensRegexString = (tokenizerData.addedTokens?.arrayValue ?? []).compactMap { addedToken in
187
- // guard let content = addedToken.content?.stringValue else { return nil }
188
- // let prefix = (addedToken.lstrip?.boolValue ?? false ? #"\s*"# : "")
189
- // let suffix = (addedToken.rstrip?.boolValue ?? false ? #"\s*"# : "")
190
- // let token = NSRegularExpression.escapedPattern(for: content)
191
- // return "\(prefix)(\(token))\(suffix)"
192
- // }.joined(separator: "|")
193
- // addedTokensRegex = try? NSRegularExpression(pattern: addedTokensRegexString, options: [])
194
-
195
186
// TODO: specialTokens are stored but never used
196
187
self . specialTokens = specialTokens
197
188
self . addedTokens = Set ( addedTokens. keys)
Original file line number Diff line number Diff line change @@ -70,10 +70,6 @@ class GemmaTokenizerTests: TokenizerTests {
70
70
let cases = [ " à " /* 0x61 0x300 */, " à " /* 0xe0 */]
71
71
let expected = [ 217138 , 1305 ]
72
72
73
- // for x in cases.map { $0.unicodeScalars.map { String(format:"0x%lX", $0.value) } } {
74
- // print(x)
75
- // }
76
-
77
73
// These are different characters
78
74
for (s, expected) in zip ( cases, expected) {
79
75
let encoded = await tester. tokenizer? . encode ( text: " " + s)
You can't perform that action at this time.
0 commit comments