@@ -141,6 +141,7 @@ public class PreTrainedTokenizer: Tokenizer {
141
141
142
142
private let addedTokens : Set < String >
143
143
private let specialTokens : [ String : Int ]
144
+ private let addedTokensRegex : NSRegularExpression ?
144
145
145
146
private let preTokenizer : PreTokenizer ?
146
147
private let normalizer : Normalizer ?
@@ -161,6 +162,16 @@ public class PreTrainedTokenizer: Tokenizer {
161
162
specialTokens [ content] = id
162
163
}
163
164
}
165
+
166
+ let addedTokensRegexString = ( tokenizerData. addedTokens? . arrayValue ?? [ ] ) . compactMap { addedToken in
167
+ guard let content = addedToken. content? . stringValue else { return nil }
168
+ let prefix = ( addedToken. lstrip? . boolValue ?? false ? #"\s*"# : " " )
169
+ let suffix = ( addedToken. rstrip? . boolValue ?? false ? #"\s*"# : " " )
170
+ let token = NSRegularExpression . escapedPattern ( for: content)
171
+ return " \( prefix) ( \( token) ) \( suffix) "
172
+ } . joined ( separator: " | " )
173
+ addedTokensRegex = try ? NSRegularExpression ( pattern: addedTokensRegexString, options: [ ] )
174
+
164
175
// TODO: specialTokens are stored but never used
165
176
self . specialTokens = specialTokens
166
177
self . addedTokens = Set ( addedTokens. keys)
@@ -211,7 +222,17 @@ public class PreTrainedTokenizer: Tokenizer {
211
222
}
212
223
213
224
/// Tokenizes `text`, honoring any configured added tokens.
///
/// The input is first split around added tokens (via `addedTokensRegex`, built
/// in the initializer) so those tokens survive verbatim; every remaining
/// section goes through the regular pipeline: normalize → pre-tokenize → model.
///
/// - Parameter text: The raw input string to tokenize.
/// - Returns: The flat list of string tokens.
public func tokenize(text: String) -> [String] {
    // Take care of special tokens first: split the input around added tokens
    // so they are preserved intact instead of being altered by normalization.
    let sections: [String]
    if let regex = addedTokensRegex {
        // NOTE(review): `split(by:)` is a project extension on String —
        // assumed to return the pieces including the regex matches themselves,
        // since matched added tokens are looked up verbatim below; confirm.
        sections = text.split(by: regex)
    } else {
        // No added tokens configured; process the whole input as one section.
        sections = [text]
    }
    // `flatMap` replaces the original `map { ... }.flatMap { $0 }` pair — same
    // result, one pass, no intermediate array-of-arrays.
    return sections.flatMap { section -> [String] in
        // An exact added-token match is emitted as a single token, bypassing
        // the normalization/pre-tokenization pipeline entirely.
        if addedTokens.contains(section) { return [section] }
        return preTokenize(normalize(section)).flatMap { model($0) }
    }
}
216
237
217
238
/// Main entry point
0 commit comments