Skip to content

Commit 1fdb4f4

Browse files
authored
Support "fuse unknown" configuration (#117)
* Bring over hf token envvar from preview branch
* Add tests for Gemma, including edge cases. Edge cases also added for other BPE tokenizers, but not for T5 yet.
* Sort added tokens by length (descending) to avoid early partial matches. Similar to huggingface/transformers.js@c305c38
* Store vocab as NSString to allow multiple tokens with the same Unicode canonical representation.
* Remove comments
* Go back to making vocab dictionaries private
* Use ungated copy of Gemma tokenizer
* Use NSString in UnigramTokenizer
* Support fuse_unk configuration. Tested on one of the T5 testEdgeCases().
* Remove duplicate function
1 parent 4c8cf07 commit 1fdb4f4

File tree

5 files changed

+39
-11
lines changed

5 files changed

+39
-11
lines changed

Sources/Tokenizers/BPETokenizer.swift

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,8 @@ class BPETokenizer: PreTrainedTokenizerModel {
4545
public let unknownToken: String?
4646
public let unknownTokenId: Int?
4747

48+
public let fuseUnknownTokens: Bool
49+
4850
required init(tokenizerConfig: Config, tokenizerData: Config, addedTokens: [String : Int]) throws {
4951
guard let merges = tokenizerData.model?.merges?.value as? [String] else { fatalError("BPETokenizer requires merges") }
5052
guard let vocab = tokenizerData.model?.vocab?.dictionary as? [NSString: Int] else {
@@ -75,6 +77,8 @@ class BPETokenizer: PreTrainedTokenizerModel {
7577

7678
bosToken = tokenizerConfig.bosToken?.stringValue
7779
bosTokenId = bosToken == nil ? nil : tokensToIds[bosToken! as NSString]
80+
81+
fuseUnknownTokens = tokenizerConfig.fuseUnk?.boolValue ?? false
7882
}
7983

8084
func convertTokenToId(_ token: String) -> Int? {

Sources/Tokenizers/BertTokenizer.swift

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -23,11 +23,14 @@ public class BertTokenizer {
2323
public var eosToken: String?
2424
public var eosTokenId: Int?
2525

26+
public let fuseUnknownTokens: Bool
27+
2628
public init(vocab: [String: Int],
27-
merges: [String]?,
28-
tokenizeChineseChars: Bool = true,
29-
bosToken: String? = nil,
30-
eosToken: String? = nil
29+
merges: [String]?,
30+
tokenizeChineseChars: Bool = true,
31+
bosToken: String? = nil,
32+
eosToken: String? = nil,
33+
fuseUnknownTokens: Bool = false
3134
) {
3235
self.vocab = vocab
3336
self.ids_to_tokens = Utils.invert(vocab)
@@ -37,6 +40,7 @@ public class BertTokenizer {
3740
self.bosTokenId = bosToken == nil ? nil : vocab[bosToken!]
3841
self.eosToken = eosToken
3942
self.eosTokenId = eosToken == nil ? nil : vocab[eosToken!]
43+
self.fuseUnknownTokens = fuseUnknownTokens
4044
}
4145

4246
public required convenience init(tokenizerConfig: Config, tokenizerData: Config, addedTokens: [String : Int]) throws {
@@ -47,7 +51,8 @@ public class BertTokenizer {
4751
let tokenizeChineseChars = tokenizerConfig.handleChineseChars?.boolValue ?? true
4852
let eosToken = tokenizerConfig.eosToken?.stringValue
4953
let bosToken = tokenizerConfig.bosToken?.stringValue
50-
self.init(vocab: vocab, merges: merges, tokenizeChineseChars: tokenizeChineseChars, bosToken: bosToken, eosToken: eosToken)
54+
let fuseUnknown = tokenizerConfig.fuseUnk?.boolValue ?? false
55+
self.init(vocab: vocab, merges: merges, tokenizeChineseChars: tokenizeChineseChars, bosToken: bosToken, eosToken: eosToken, fuseUnknownTokens: fuseUnknown)
5156
}
5257

5358

Sources/Tokenizers/Tokenizer.swift

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,8 @@ public protocol TokenizingModel {
3636
var eosTokenId: Int? { get }
3737
var unknownToken: String? { get }
3838
var unknownTokenId: Int? { get }
39+
40+
var fuseUnknownTokens: Bool { get }
3941
}
4042

4143
public extension TokenizingModel {
@@ -138,6 +140,7 @@ public class PreTrainedTokenizer: Tokenizer {
138140
public var eosTokenId: Int? { model.eosTokenId }
139141
public var unknownToken: String? { model.unknownToken }
140142
public var unknownTokenId: Int? { model.unknownTokenId }
143+
public var fuseUnknownTokens: Bool { model.fuseUnknownTokens }
141144

142145
private let addedTokens: Set<String>
143146
private let specialTokens: [String: Int]
@@ -232,6 +235,21 @@ public class PreTrainedTokenizer: Tokenizer {
232235
.replacingOccurrences(of: " 're", with: "'re")
233236
}
234237

238+
func fuseUnknown(_ tokens: [String]) -> [String] {
239+
guard fuseUnknownTokens else { return tokens }
240+
let (fused, _) = tokens.reduce((fused: [String](), previousIsUnknown: false)) { result, token in
241+
var (fused, previousIsUnknown) = result
242+
let isUnknown = model.convertTokenToId(token) == model.unknownTokenId
243+
if isUnknown {
244+
if !previousIsUnknown { fused.append(token) }
245+
} else {
246+
fused.append(token)
247+
}
248+
return (fused, isUnknown)
249+
}
250+
return fused
251+
}
252+
235253
public func tokenize(text: String) -> [String] {
236254
// Take care of special tokens first
237255
let sections: [String]
@@ -243,7 +261,7 @@ public class PreTrainedTokenizer: Tokenizer {
243261
return sections.enumerated().map { section, x in
244262
if addedTokens.contains(x) { return [x] }
245263
return preTokenize(normalize(x), options: section == 0 ? [.firstSection] : []).flatMap { model($0) }
246-
}.flatMap { $0 }
264+
}.flatMap { fuseUnknown($0) }
247265
}
248266

249267
/// Main entry point

Sources/Tokenizers/UnigramTokenizer.swift

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,10 @@ class UnigramTokenizer: PreTrainedTokenizerModel {
2929
let bosTokenId: Int?
3030
let eosToken: String?
3131
let eosTokenId: Int?
32-
32+
33+
// Hardcoded in Unigram tokenizers
34+
let fuseUnknownTokens: Bool = true
35+
3336
private let trie: Trie<Character>
3437

3538
required init(tokenizerConfig: Config, tokenizerData: Config, addedTokens: [String : Int]) throws {
@@ -71,10 +74,8 @@ class UnigramTokenizer: PreTrainedTokenizerModel {
7174

7275
trie = Trie()
7376
trie.append(contentsOf: vocab.map { $0.token })
74-
75-
// TODO: set fuse_unk to true
7677
}
77-
78+
7879
func convertTokenToId(_ token: String) -> Int? {
7980
return tokensToIds[token as NSString] ?? self.unknownTokenId
8081
}

Tests/TokenizersTests/Resources/tokenizer_tests.json

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

0 commit comments

Comments (0)