Skip to content

Commit 1fab24c

Browse files
authored
Deepseek R1 tokenization support and fixes (#159)
* Update Jinja, add Qwen R1 test
* More robust added token support: a serialized AddedToken class is partially supported (in addition to String values)
* Actually pass added tokens to decoder
* Temporarily revert Jinja upgrade
1 parent 313fbd7 commit 1fab24c

File tree

2 files changed

+22
-5
lines changed

2 files changed

+22
-5
lines changed

Sources/Tokenizers/BPETokenizer.swift

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -83,11 +83,11 @@ class BPETokenizer: PreTrainedTokenizerModel {
8383
self.unknownToken = nil
8484
self.unknownTokenId = nil
8585
}
86-
87-
eosToken = tokenizerConfig.eosToken?.stringValue
86+
87+
eosToken = addedTokenAsString(tokenizerConfig.eosToken)
8888
eosTokenId = eosToken == nil ? nil : tokensToIds[eosToken! as NSString]
8989

90-
bosToken = tokenizerConfig.bosToken?.stringValue
90+
bosToken = addedTokenAsString(tokenizerConfig.bosToken)
9191
bosTokenId = bosToken == nil ? nil : tokensToIds[bosToken! as NSString]
9292

9393
fuseUnknownTokens = tokenizerConfig.fuseUnk?.boolValue ?? false

Sources/Tokenizers/Tokenizer.swift

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,17 @@ public protocol TokenizingModel {
4141
var fuseUnknownTokens: Bool { get }
4242
}
4343

44+
// Helper - possibly to be moved somewhere else
45+
/// Returns the textual form of an added-token configuration entry.
///
/// - Parameter addedToken: The configuration node describing the token, or `nil`.
/// - Returns: The node's own `stringValue` when it has one; otherwise the
///   `stringValue` of its `content` member; `nil` when neither is available
///   or when `addedToken` itself is `nil`.
func addedTokenAsString(_ addedToken: Config?) -> String? {
    // A plain string entry wins. When there is none, the entry is possibly a
    // serialization of the AddedToken class, so fall back to its `content`.
    // TODO: support lstrip, rstrip, normalized, etc.
    addedToken?.stringValue ?? addedToken?.content?.stringValue
}
54+
4455
public extension TokenizingModel {
4556
func callAsFunction(_ text: String) -> [String] {
4657
tokenize(text: text)
@@ -241,7 +252,7 @@ public class PreTrainedTokenizer: Tokenizer {
241252
self.preTokenizer = PreTokenizerFactory.fromConfig(config: tokenizerData.preTokenizer)
242253
self.normalizer = NormalizerFactory.fromConfig(config: tokenizerData.normalizer)
243254
self.postProcessor = PostProcessorFactory.fromConfig(config: tokenizerData.postProcessor)
244-
self.decoder = DecoderFactory.fromConfig(config: tokenizerData.decoder)
255+
self.decoder = DecoderFactory.fromConfig(config: tokenizerData.decoder, addedTokens: self.addedTokens)
245256
self.cleanUpTokenizationSpaces = tokenizerConfig.cleanUpTokenizationSpaces?.boolValue ?? true
246257
self.tokenizerConfig = tokenizerConfig
247258

@@ -421,7 +432,13 @@ public class PreTrainedTokenizer: Tokenizer {
421432
// TODO: maybe keep NSString here
422433
for (key, value) in tokenizerConfig.dictionary as [String : Any] {
423434
if specialTokenAttributes.contains(key), !(value is NSNull) {
424-
context[key] = value
435+
if let stringValue = value as? String {
436+
context[key] = stringValue
437+
} else if let dictionary = value as? [NSString:Any] {
438+
context[key] = addedTokenAsString(Config(dictionary))
439+
} else {
440+
context[key] = value
441+
}
425442
}
426443
}
427444

0 commit comments

Comments
 (0)