Skip to content

Commit bbbd7bf

Browse files
pcuenca and davidkoski authored
Gemma tokenizer, fix Unicode split (#52)
* Add GemmaTokenizer class

* Fix unicode split, and do not omit empty sequences

This makes it equivalent to the Python and JavaScript versions as far as I can tell.

Co-authored-by: davidkoski <davidkoski@users.noreply.github.com>

---------

Co-authored-by: davidkoski <davidkoski@users.noreply.github.com>
1 parent 1a001b5 commit bbbd7bf

File tree

2 files changed

+19
-17
lines changed

2 files changed

+19
-17
lines changed

Sources/Tokenizers/BPETokenizer.swift

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ class BPETokenizer: PreTrainedTokenizerModel {
4646
}
4747
var bpeRanks: Dictionary<BytePair, Int> = [:]
4848
for (i, item) in merges.enumerated() {
49-
let tuple = item.split(separator: " ").map { String($0) }
49+
let tuple = item.unicodeScalars.split(separator: " ", omittingEmptySubsequences: false).map { String($0) }
5050
let bp = BytePair(tuple: tuple)
5151
bpeRanks[bp] = i
5252
}

Sources/Tokenizers/Tokenizer.swift

Lines changed: 18 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -55,16 +55,16 @@ public protocol PreTrainedTokenizerModel: TokenizingModel {
5555

5656
struct TokenizerModel {
5757
static let knownTokenizers: [String : PreTrainedTokenizerModel.Type] = [
58-
"BertTokenizer" : BertTokenizer.self,
59-
"GPT2Tokenizer" : GPT2Tokenizer.self,
60-
"FalconTokenizer" : FalconTokenizer.self,
61-
"LlamaTokenizer" : LlamaTokenizer.self,
62-
"CodeLlamaTokenizer": LlamaTokenizer.self,
63-
"CodeGenTokenizer": CodeGenTokenizer.self,
64-
"WhisperTokenizer": WhisperTokenizer.self,
65-
"T5Tokenizer" : T5Tokenizer.self,
66-
67-
// Default
58+
"BertTokenizer" : BertTokenizer.self,
59+
"CodeGenTokenizer" : CodeGenTokenizer.self,
60+
"CodeLlamaTokenizer" : CodeLlamaTokenizer.self,
61+
"FalconTokenizer" : FalconTokenizer.self,
62+
"GemmaTokenizer" : GemmaTokenizer.self,
63+
"GPT2Tokenizer" : GPT2Tokenizer.self,
64+
"LlamaTokenizer" : LlamaTokenizer.self,
65+
"T5Tokenizer" : T5Tokenizer.self,
66+
"WhisperTokenizer" : WhisperTokenizer.self,
67+
6868
"PreTrainedTokenizer": BPETokenizer.self
6969
]
7070

@@ -245,10 +245,12 @@ extension AutoTokenizer {
245245

246246
// MARK: - Tokenizer model classes
247247

248-
class GPT2Tokenizer : BPETokenizer {}
249-
class FalconTokenizer : BPETokenizer {}
250-
class LlamaTokenizer : BPETokenizer {}
251-
class CodeGenTokenizer : BPETokenizer {}
252-
class WhisperTokenizer : BPETokenizer {}
248+
class GPT2Tokenizer : BPETokenizer {}
249+
class FalconTokenizer : BPETokenizer {}
250+
class LlamaTokenizer : BPETokenizer {}
251+
class CodeGenTokenizer : BPETokenizer {}
252+
class WhisperTokenizer : BPETokenizer {}
253+
class GemmaTokenizer : BPETokenizer {}
254+
class CodeLlamaTokenizer: BPETokenizer {}
253255

254-
class T5Tokenizer : UnigramTokenizer {}
256+
class T5Tokenizer : UnigramTokenizer {}

0 commit comments

Comments (0)