
Commit fa875e6

Create LlamaPreTrainedTokenizer subclass
We need some custom behaviour that's not in the config :(
1 parent 2eb5995 commit fa875e6

File tree

1 file changed (+47 -2)

Sources/Tokenizers/Tokenizer.swift

Lines changed: 47 additions & 2 deletions
@@ -262,9 +262,32 @@ public class PreTrainedTokenizer: Tokenizer {
 
 public struct AutoTokenizer {}
 
+struct PreTrainedTokenizerClasses {
+    /// Class overrides for custom behaviour
+    /// Not to be confused with the TokenizerModel classes defined in TokenizerModel
+    static let tokenizerClasses: [String : PreTrainedTokenizer.Type] = [
+        "LlamaTokenizer": LlamaPreTrainedTokenizer.self
+    ]
+}
+
 extension AutoTokenizer {
+    static func tokenizerClass(for tokenizerConfig: Config) -> PreTrainedTokenizer.Type {
+        guard let tokenizerClassName = tokenizerConfig.tokenizerClass?.stringValue else {
+            return PreTrainedTokenizer.self
+        }
+
+        // Some tokenizer_class entries use a Fast suffix
+        let tokenizerName = tokenizerClassName.replacingOccurrences(of: "Fast", with: "")
+        if let tokenizerClass = PreTrainedTokenizerClasses.tokenizerClasses[tokenizerName] {
+            return tokenizerClass
+        }
+
+        return PreTrainedTokenizer.self
+    }
+
     public static func from(tokenizerConfig: Config, tokenizerData: Config) throws -> Tokenizer {
-        return try PreTrainedTokenizer(tokenizerConfig: tokenizerConfig, tokenizerData: tokenizerData)
+        let tokenizerClass = tokenizerClass(for: tokenizerConfig)
+        return try tokenizerClass.init(tokenizerConfig: tokenizerConfig, tokenizerData: tokenizerData)
     }
 
     public static func from(
@@ -275,7 +298,7 @@ extension AutoTokenizer {
         guard let tokenizerConfig = try await config.tokenizerConfig else { throw TokenizerError.missingConfig }
         let tokenizerData = try await config.tokenizerData
 
-        return try PreTrainedTokenizer(tokenizerConfig: tokenizerConfig, tokenizerData: tokenizerData)
+        return try AutoTokenizer.from(tokenizerConfig: tokenizerConfig, tokenizerData: tokenizerData)
     }
 
     public static func from(
@@ -302,3 +325,25 @@ class CodeLlamaTokenizer: BPETokenizer {}
 class CohereTokenizer : BPETokenizer {}
 
 class T5Tokenizer : UnigramTokenizer {}
+
+
+// MARK: - PreTrainedTokenizer classes
+
+let sentencePieceUnderline = "▁"
+
+// See https://github.com/xenova/transformers.js/blob/1a9964fb09b8f54fcbeac46dc6aae8d76795809d/src/tokenizers.js#L3203 for these exceptions
+class LlamaPreTrainedTokenizer: PreTrainedTokenizer {
+    let isLegacy: Bool
+
+    required init(tokenizerConfig: Config, tokenizerData: Config) throws {
+        isLegacy = tokenizerConfig.legacy?.boolValue ?? true
+        var configDictionary = tokenizerData.dictionary
+        if !isLegacy {
+            configDictionary.removeValue(forKey: "normalizer")
+            configDictionary["pre_tokenizer"] = ["type": "Metaspace", "replacement": sentencePieceUnderline, "add_prefix_space": true, "prepend_scheme": "first"]
+        }
+        let updatedData = Config(configDictionary)
+
+        try super.init(tokenizerConfig: tokenizerConfig, tokenizerData: updatedData)
+    }
+}
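
For context, a quick usage sketch that is not part of the commit: the async from(pretrained:) overload is assumed from the other AutoTokenizer.from entry points in this file, and the model id is a placeholder for any checkpoint whose tokenizer_config.json declares "tokenizer_class": "LlamaTokenizerFast". With this change, such a config now resolves to LlamaPreTrainedTokenizer, which drops the normalizer and installs a Metaspace pre-tokenizer when "legacy" is false before delegating to PreTrainedTokenizer.

import Tokenizers

// Hypothetical usage, not part of this diff; run inside an async context.
// The model id is a placeholder; assume its tokenizer_config.json sets
// "tokenizer_class": "LlamaTokenizerFast" and "legacy": false.
let tokenizer = try await AutoTokenizer.from(pretrained: "meta-llama/Llama-2-7b-hf")

// tokenizerClass(for:) strips the "Fast" suffix, finds "LlamaTokenizer" in
// PreTrainedTokenizerClasses.tokenizerClasses, and instantiates
// LlamaPreTrainedTokenizer instead of the generic PreTrainedTokenizer.
let inputIds = tokenizer.encode(text: "Hello world")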
