@@ -262,9 +262,32 @@ public class PreTrainedTokenizer: Tokenizer {
262
262
263
263
/// Namespace type whose static factory methods build a concrete tokenizer from configuration.
public struct AutoTokenizer {}
264
264
265
struct PreTrainedTokenizerClasses {
    /// Class overrides for custom behaviour.
    /// Not to be confused with the TokenizerModel classes defined in TokenizerModel.
    static let tokenizerClasses: [String: PreTrainedTokenizer.Type] = [
        "LlamaTokenizer": LlamaPreTrainedTokenizer.self
    ]
}
272
+
265
273
extension AutoTokenizer {
274
+ static func tokenizerClass( for tokenizerConfig: Config ) -> PreTrainedTokenizer . Type {
275
+ guard let tokenizerClassName = tokenizerConfig. tokenizerClass? . stringValue else {
276
+ return PreTrainedTokenizer . self
277
+ }
278
+
279
+ // Some tokenizer_class entries use a Fast suffix
280
+ let tokenizerName = tokenizerClassName. replacingOccurrences ( of: " Fast " , with: " " )
281
+ if let tokenizerClass = PreTrainedTokenizerClasses . tokenizerClasses [ tokenizerName] {
282
+ return tokenizerClass
283
+ }
284
+
285
+ return PreTrainedTokenizer . self
286
+ }
287
+
266
288
public static func from( tokenizerConfig: Config , tokenizerData: Config ) throws -> Tokenizer {
267
- return try PreTrainedTokenizer ( tokenizerConfig: tokenizerConfig, tokenizerData: tokenizerData)
289
+ let tokenizerClass = tokenizerClass ( for: tokenizerConfig)
290
+ return try tokenizerClass. init ( tokenizerConfig: tokenizerConfig, tokenizerData: tokenizerData)
268
291
}
269
292
270
293
public static func from(
@@ -275,7 +298,7 @@ extension AutoTokenizer {
275
298
guard let tokenizerConfig = try await config. tokenizerConfig else { throw TokenizerError . missingConfig }
276
299
let tokenizerData = try await config. tokenizerData
277
300
278
- return try PreTrainedTokenizer ( tokenizerConfig: tokenizerConfig, tokenizerData: tokenizerData)
301
+ return try AutoTokenizer . from ( tokenizerConfig: tokenizerConfig, tokenizerData: tokenizerData)
279
302
}
280
303
281
304
public static func from(
@@ -302,3 +325,25 @@ class CodeLlamaTokenizer: BPETokenizer {}
302
325
/// Cohere models tokenize with byte-pair encoding.
class CohereTokenizer: BPETokenizer {}

/// T5 models tokenize with a Unigram model.
class T5Tokenizer: UnigramTokenizer {}
328
+
329
+
330
// MARK: - PreTrainedTokenizer classes

/// The SentencePiece metaspace marker (U+2581, "▁") used to represent word boundaries.
let sentencePieceUnderline = "▁"
333
+
334
// See https://github.com/xenova/transformers.js/blob/1a9964fb09b8f54fcbeac46dc6aae8d76795809d/src/tokenizers.js#L3203 for these exceptions
/// Llama tokenizer that mirrors the transformers.js handling of non-legacy configs:
/// the normalizer is dropped and a Metaspace pre-tokenizer is injected instead.
class LlamaPreTrainedTokenizer: PreTrainedTokenizer {
    /// `true` unless the tokenizer config explicitly sets `legacy: false`.
    let isLegacy: Bool

    required init(tokenizerConfig: Config, tokenizerData: Config) throws {
        isLegacy = tokenizerConfig.legacy?.boolValue ?? true

        var dataDictionary = tokenizerData.dictionary
        if !isLegacy {
            // Non-legacy Llama replaces normalization with Metaspace pre-tokenization.
            dataDictionary.removeValue(forKey: "normalizer")
            dataDictionary["pre_tokenizer"] = [
                "type": "Metaspace",
                "replacement": sentencePieceUnderline,
                "add_prefix_space": true,
                "prepend_scheme": "first"
            ]
        }

        try super.init(tokenizerConfig: tokenizerConfig, tokenizerData: Config(dataDictionary))
    }
}
0 commit comments