Skip to content

Commit 3bd0226

Browse files
authored
Export bosToken and eosToken (#70)
1 parent 07f6707 commit 3bd0226

File tree

4 files changed

+43
-7
lines changed

4 files changed

+43
-7
lines changed

Sources/Tokenizers/BPETokenizer.swift

Lines changed: 12 additions & 2 deletions
Original file line number · Diff line number · Diff line change
@@ -36,9 +36,13 @@ class BPETokenizer: PreTrainedTokenizerModel {
3636
private let tokensToIds: [String: Int]
3737
private let idsToTokens: [Int: String]
3838

39+
public let bosToken: String?
40+
public let bosTokenId: Int?
41+
public let eosToken: String?
42+
public let eosTokenId: Int?
3943
public let unknownToken: String?
4044
public let unknownTokenId: Int?
41-
45+
4246
required init(tokenizerConfig: Config, tokenizerData: Config, addedTokens: [String : Int]) throws {
4347
guard let merges = tokenizerData.model?.merges?.value as? [String] else { fatalError("BPETokenizer requires merges") }
4448
guard let vocab = tokenizerData.model?.vocab?.dictionary as? [String: Int] else {
@@ -55,14 +59,20 @@ class BPETokenizer: PreTrainedTokenizerModel {
5559
self.tokensToIds = vocab.merging(addedTokens) { $1 }
5660
self.idsToTokens = Utils.invert(self.tokensToIds)
5761

58-
// Populate unknown token
62+
// Populate tokens
5963
if let unknownToken = TokenizerModel.unknownToken(from: tokenizerConfig) {
6064
self.unknownToken = unknownToken
6165
self.unknownTokenId = self.tokensToIds[unknownToken]
6266
} else {
6367
self.unknownToken = nil
6468
self.unknownTokenId = nil
6569
}
70+
71+
eosToken = tokenizerConfig.eosToken?.stringValue
72+
eosTokenId = eosToken == nil ? nil : tokensToIds[eosToken!]
73+
74+
bosToken = tokenizerConfig.bosToken?.stringValue
75+
bosTokenId = bosToken == nil ? nil : tokensToIds[bosToken!]
6676
}
6777

6878
func convertTokenToId(_ token: String) -> Int? {

Sources/Tokenizers/BertTokenizer.swift

Lines changed: 17 additions & 3 deletions
Original file line number · Diff line number · Diff line change
@@ -17,14 +17,26 @@ class BertTokenizer {
1717

1818
private let vocab: [String: Int]
1919
private let ids_to_tokens: [Int: String]
20-
20+
21+
var bosToken: String?
22+
var bosTokenId: Int?
23+
var eosToken: String?
24+
var eosTokenId: Int?
25+
2126
init(vocab: [String: Int],
2227
merges: [String]?,
23-
tokenizeChineseChars: Bool = true) {
28+
tokenizeChineseChars: Bool = true,
29+
bosToken: String? = nil,
30+
eosToken: String? = nil
31+
) {
2432
self.vocab = vocab
2533
self.ids_to_tokens = Utils.invert(vocab)
2634
self.wordpieceTokenizer = WordpieceTokenizer(vocab: self.vocab)
2735
self.tokenizeChineseChars = tokenizeChineseChars
36+
self.bosToken = bosToken
37+
self.bosTokenId = bosToken == nil ? nil : vocab[bosToken!]
38+
self.eosToken = eosToken
39+
self.eosTokenId = eosToken == nil ? nil : vocab[eosToken!]
2840
}
2941

3042
required convenience init(tokenizerConfig: Config, tokenizerData: Config, addedTokens: [String : Int]) throws {
@@ -33,7 +45,9 @@ class BertTokenizer {
3345
}
3446
let merges = tokenizerData.model?.merges?.value as? [String]
3547
let tokenizeChineseChars = tokenizerConfig.handleChineseChars?.boolValue ?? true
36-
self.init(vocab: vocab, merges: merges, tokenizeChineseChars: tokenizeChineseChars)
48+
let eosToken = tokenizerConfig.eosToken?.stringValue
49+
let bosToken = tokenizerConfig.bosToken?.stringValue
50+
self.init(vocab: vocab, merges: merges, tokenizeChineseChars: tokenizeChineseChars, bosToken: bosToken, eosToken: eosToken)
3751
}
3852

3953

Sources/Tokenizers/Tokenizer.swift

Lines changed: 12 additions & 0 deletions
Original file line number · Diff line number · Diff line change
@@ -30,6 +30,10 @@ public protocol TokenizingModel {
3030
func convertIdToToken(_ id: Int) -> String?
3131
func convertIdsToTokens(_ ids: [Int]) -> [String?]
3232

33+
var bosToken: String? { get }
34+
var bosTokenId: Int? { get }
35+
var eosToken: String? { get }
36+
var eosTokenId: Int? { get }
3337
var unknownToken: String? { get }
3438
var unknownTokenId: Int? { get }
3539
}
@@ -103,6 +107,10 @@ public protocol Tokenizer {
103107
func convertIdToToken(_ id: Int) -> String?
104108
func convertIdsToTokens(_ ids: [Int]) -> [String?]
105109

110+
var bosToken: String? { get }
111+
var bosTokenId: Int? { get }
112+
var eosToken: String? { get }
113+
var eosTokenId: Int? { get }
106114
var unknownToken: String? { get }
107115
var unknownTokenId: Int? { get }
108116
}
@@ -124,6 +132,10 @@ public extension Tokenizer {
124132
public class PreTrainedTokenizer: Tokenizer {
125133
let model: TokenizingModel
126134

135+
public var bosToken: String? { model.bosToken }
136+
public var bosTokenId: Int? { model.bosTokenId }
137+
public var eosToken: String? { model.eosToken }
138+
public var eosTokenId: Int? { model.eosTokenId }
127139
public var unknownToken: String? { model.unknownToken }
128140
public var unknownTokenId: Int? { model.unknownTokenId }
129141

Sources/Tokenizers/UnigramTokenizer.swift

Lines changed: 2 additions & 2 deletions
Original file line number · Diff line number · Diff line change
@@ -24,7 +24,7 @@ class UnigramTokenizer: PreTrainedTokenizerModel {
2424
let minScore: Float
2525
let tokensToIds: [String: Int]
2626

27-
let bosToken = " "
27+
let bosToken: String? = " "
2828
let bosTokenId: Int?
2929
let eosToken: String?
3030
let eosTokenId: Int?
@@ -51,7 +51,7 @@ class UnigramTokenizer: PreTrainedTokenizerModel {
5151
self.unknownPiece = SentencePieceToken(token: vocab[unknownTokenId].token, score: minScore - 10)
5252

5353
tokensToIds = Dictionary(uniqueKeysWithValues: vocab.map { $0.token }.enumerated().map { ($1, $0) })
54-
bosTokenId = tokensToIds[bosToken] // May be nil
54+
bosTokenId = tokensToIds[bosToken!] // May be nil
5555

5656
eosToken = tokenizerConfig.eosToken?.stringValue
5757
eosTokenId = eosToken == nil ? nil : tokensToIds[eosToken!]

0 commit comments

Comments (0)