@@ -7,6 +7,7 @@
 
 import Hub
 import Foundation
+import Jinja
 
 enum TokenizerError: Error {
     case missingConfig
@@ -98,7 +99,8 @@ public protocol Tokenizer {
 
     /// Main entry point
     func encode(text: String) -> [Int]
-    func callAsFunction(_ text: String) -> [Int]
+    func encode(text: String, addSpecialTokens: Bool) -> [Int]
+    func callAsFunction(_ text: String, addSpecialTokens: Bool) -> [Int]
 
     /// Decode
     func decode(tokens: [Int]) -> String
@@ -115,11 +117,21 @@ public protocol Tokenizer {
     var eosTokenId: Int? { get }
     var unknownToken: String? { get }
     var unknownTokenId: Int? { get }
+
+    func applyChatTemplate(messages: [[String: String]]) throws -> [Int]
+
+    func applyChatTemplate(
+        messages: [[String: String]],
+        chatTemplate: String?,
+        addGenerationPrompt: Bool,
+        truncation: Bool,
+        maxLength: Int?
+    ) throws -> [Int]
 }
 
 public extension Tokenizer {
-    func callAsFunction(_ text: String) -> [Int] {
-        encode(text: text)
+    func callAsFunction(_ text: String, addSpecialTokens: Bool = true) -> [Int] {
+        encode(text: text, addSpecialTokens: addSpecialTokens)
     }
 
     func convertTokensToIds(_ tokens: [String]) -> [Int?] {
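The default argument in the extension keeps existing call sites compiling while letting new ones opt out of special tokens. A minimal sketch of the two call styles, assuming this package's `Tokenizers` module (the helper function and input string are illustrative, not part of the diff):

```swift
import Tokenizers

// Sketch only: `tokenizer` is any Tokenizer-conforming value obtained elsewhere.
func encodeBothWays(_ tokenizer: Tokenizer) -> ([Int], [Int]) {
    // Default call: the post-processor may wrap the output with special tokens.
    let withSpecials = tokenizer("Hello world")
    // Opting out: the new addSpecialTokens flag is forwarded to encode(text:addSpecialTokens:).
    let bare = tokenizer("Hello world", addSpecialTokens: false)
    return (withSpecials, bare)
}
```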
@@ -131,6 +143,17 @@ public extension Tokenizer {
     }
 }
 
+let specialTokenAttributes: [String] = [
+    "bos_token",
+    "eos_token",
+    "unk_token",
+    "sep_token",
+    "pad_token",
+    "cls_token",
+    "mask_token",
+    "additional_special_tokens"
+]
+
 public class PreTrainedTokenizer: Tokenizer {
     let model: TokenizingModel
 
@@ -150,8 +173,11 @@ public class PreTrainedTokenizer: Tokenizer {
     private let normalizer: Normalizer?
     private let postProcessor: PostProcessor?
     private let decoder: Decoder?
+    private let tokenizerConfig: Config
 
     private let cleanUpTokenizationSpaces: Bool
+
+    private let defaultChatTemplate: String = "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"
 
     required public init(tokenizerConfig: Config, tokenizerData: Config) throws {
         var addedTokens: [String: Int] = [:]
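The fallback template is the ChatML layout. For illustration, rendering it over a single user message (an invented example, not from the diff) with `add_generation_prompt` set to true would produce:

```
<|im_start|>user
Hello<|im_end|>
<|im_start|>assistant
```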
@@ -195,7 +221,8 @@ public class PreTrainedTokenizer: Tokenizer {
         self.postProcessor = PostProcessorFactory.fromConfig(config: tokenizerData.postProcessor)
         self.decoder = DecoderFactory.fromConfig(config: tokenizerData.decoder)
         self.cleanUpTokenizationSpaces = tokenizerConfig.cleanUpTokenizationSpaces?.boolValue ?? true
-
+        self.tokenizerConfig = tokenizerConfig
+
         model = try TokenizerModel.from(tokenizerConfig: tokenizerConfig, tokenizerData: tokenizerData, addedTokens: addedTokens)
     }
 
@@ -209,9 +236,9 @@ public class PreTrainedTokenizer: Tokenizer {
         return normalizer(text: text)
     }
 
-    func postProcess(_ tokens: [String]) -> [String] {
+    func postProcess(_ tokens: [String], addSpecialTokens: Bool = true) -> [String] {
         guard let postProcessor = postProcessor else { return tokens }
-        return postProcessor(tokens: tokens)
+        return postProcessor(tokens: tokens, addSpecialTokens: addSpecialTokens)
     }
 
     func decodeTokens(_ tokens: [String]) -> [String] {
@@ -265,8 +292,12 @@ public class PreTrainedTokenizer: Tokenizer {
     }
 
     /// Main entry point
+    public func encode(text: String, addSpecialTokens: Bool = true) -> [Int] {
+        return postProcess(tokenize(text: text), addSpecialTokens: addSpecialTokens).map { model.convertTokenToId($0)! }
+    }
+
     public func encode(text: String) -> [Int] {
-        return postProcess(tokenize(text: text)).map { model.convertTokenToId($0)! }
+        return encode(text: text, addSpecialTokens: true)
     }
 
     /// Decode
@@ -285,6 +316,43 @@ public class PreTrainedTokenizer: Tokenizer {
     public func convertIdToToken(_ id: Int) -> String? {
         model.convertIdToToken(id)
     }
+
+    public func applyChatTemplate(messages: [[String: String]]) throws -> [Int] {
+        try applyChatTemplate(messages: messages, chatTemplate: nil, addGenerationPrompt: true, maxLength: nil)
+    }
+
+    public func applyChatTemplate(
+        messages: [[String: String]],
+        chatTemplate: String?,
+        addGenerationPrompt: Bool = false,
+        truncation: Bool = false,
+        maxLength: Int?
+    ) throws -> [Int] {
+        let template = try Template(chatTemplate ?? tokenizerConfig.chatTemplate?.stringValue ?? defaultChatTemplate)
+        var context: [String: Any] = [
+            "messages": messages,
+            "add_generation_prompt": addGenerationPrompt
+        ]
+
+        // TODO: maybe keep NSString here
+        for (key, value) in tokenizerConfig.dictionary as [String: Any] {
+            if specialTokenAttributes.contains(key), !(value is NSNull) {
+                context[key] = value
+            }
+        }
+
+        let rendered = try template.render(context)
+        var encodedTokens = encode(text: rendered, addSpecialTokens: false)
+        var maxLength = maxLength ?? encodedTokens.count
+        maxLength = min(maxLength, tokenizerConfig.modelMaxLength?.intValue ?? maxLength)
+        if encodedTokens.count > maxLength {
+            if truncation {
+                encodedTokens = Array(encodedTokens.prefix(maxLength))
+            }
+        }
+
+        return encodedTokens
+    }
 }
 
 // MARK: - Building
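Taken together, the new surface can be exercised end to end. A minimal usage sketch, assuming a tokenizer loaded through the package's `AutoTokenizer.from(pretrained:)` entry point (the checkpoint name and message contents are placeholders, not part of this diff):

```swift
import Tokenizers

// Placeholder checkpoint; any repo with tokenizer_config.json works the same way.
let tokenizer = try await AutoTokenizer.from(pretrained: "microsoft/Phi-3-mini-4k-instruct")

let messages = [
    ["role": "user", "content": "Describe the Swift programming language."]
]

// Convenience overload: uses the config's chat_template (or the ChatML fallback),
// appends the generation prompt, and applies no truncation.
let inputIds = try tokenizer.applyChatTemplate(messages: messages)

// Full overload: cap the rendered prompt at 512 tokens, truncating if needed.
let capped = try tokenizer.applyChatTemplate(
    messages: messages,
    chatTemplate: nil,
    addGenerationPrompt: true,
    truncation: true,
    maxLength: 512
)
```

Note that the rendered template is encoded with `addSpecialTokens: false`, since chat templates are expected to spell out their own special tokens.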