Skip to content

Commit 4f31334

Browse files
authored
Support new merges serialization (#131)
* Support new merges serialization

  Introduced in tokenizers 0.20.0. Tokenizers saved with it will create a `merges` property where each merge is an array of two items, instead of a string with a separator.

* nit
1 parent 0f23067 commit 4f31334

File tree

1 file changed

+16
-4
lines changed

1 file changed

+16
-4
lines changed

Sources/Tokenizers/BPETokenizer.swift

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -47,15 +47,27 @@ class BPETokenizer: PreTrainedTokenizerModel {
4747

4848
public let fuseUnknownTokens: Bool
4949

50+
/// Reads the BPE merge rules stored in the `merges` entry of a tokenizer configuration.
///
/// Two serializations are accepted:
/// - the format written by tokenizers >= 0.20.0, where each merge is already a list of items;
/// - the legacy format, where each merge is one string whose items are separated by a single space.
///
/// - Parameter config: The configuration node holding the merges, or `nil` if it is absent.
/// - Returns: One array of strings per merge rule, or `nil` when `config` is `nil` or its value
///   is neither `[[String]]` nor `[String]`.
static func mergesFromConfig(_ config: Config?) -> [[String]]? {
    guard let mergesConfig = config else { return nil }
    let rawValue = mergesConfig.value

    // New format (pushed with tokenizers >= 0.20.0): the value is already split, return it untouched.
    if let splitMerges = rawValue as? [[String]] {
        return splitMerges
    }

    // Legacy format: every merge is a single string that still has to be split.
    guard let legacyMerges = rawValue as? [String] else {
        return nil
    }

    var parsedMerges: [[String]] = []
    for legacyMerge in legacyMerges {
        // Split on unicode scalars and keep empty pieces so the result matches the stored string exactly.
        let pieces = legacyMerge.unicodeScalars.split(separator: " ", omittingEmptySubsequences: false)
        parsedMerges.append(pieces.map { String($0) })
    }
    return parsedMerges
}
62+
5063
required init(tokenizerConfig: Config, tokenizerData: Config, addedTokens: [String : Int]) throws {
51-
guard let merges = tokenizerData.model?.merges?.value as? [String] else { fatalError("BPETokenizer requires merges") }
64+
guard let merges = Self.mergesFromConfig(tokenizerData.model?.merges) else { fatalError("BPETokenizer requires merges") }
5265
guard let vocab = tokenizerData.model?.vocab?.dictionary as? [NSString: Int] else {
5366
throw TokenizerError.missingVocab
5467
}
5568
var bpeRanks: Dictionary<BytePair, Int> = [:]
56-
for (i, item) in merges.enumerated() {
57-
let tuple = item.unicodeScalars.split(separator: " ", omittingEmptySubsequences: false).map { String($0) }
58-
let bp = BytePair(tuple: tuple)
69+
for (i, merge) in merges.enumerated() {
70+
let bp = BytePair(tuple: merge)
5971
bpeRanks[bp] = i
6072
}
6173
self.bpeRanks = bpeRanks

0 commit comments

Comments
 (0)