Prefer chat_template.json for chat template #184

Merged (2 commits) on Feb 27, 2025
27 changes: 25 additions & 2 deletions Sources/Hub/Hub.swift
@@ -177,7 +177,7 @@ public class LanguageModelConfigurationFromHub {
        modelName: String,
        hubApi: HubApi = .shared
    ) async throws -> Configurations {
-       let filesToDownload = ["config.json", "tokenizer_config.json", "tokenizer.json"]
+       let filesToDownload = ["config.json", "tokenizer_config.json", "chat_template.json", "tokenizer.json"]
        let repo = Hub.Repo(id: modelName)
        let downloadedModelFolder = try await hubApi.snapshot(from: repo, matching: filesToDownload)

@@ -190,9 +190,32 @@
    ) async throws -> Configurations {
        // Note tokenizerConfig may be nil (does not exist in all models)
        let modelConfig = try hubApi.configuration(fileURL: modelFolder.appending(path: "config.json"))
        // First try to get the tokenizer_config.json
        let tokenizerConfig = try? hubApi.configuration(fileURL: modelFolder.appending(path: "tokenizer_config.json"))
        // Check for chat_template.json, which contains the preferred chat template for vision language models
        if let chatTemplateConfig = try? hubApi.configuration(fileURL: modelFolder.appending(path: "chat_template.json")) {
Review comment (member):
Technically, this is not the same algorithm used in transformers. IIRC, if we instantiate a tokenizer from a repo where the tokenizer has a chat template and a different chat_template.json, the template from the tokenizer will still be used. However, if we instantiate a processor, then the chat_template.json will be used.

I'm willing to diverge from this behaviour, given that:

  • Chat template divergence should not be expected (it's possibly a mistake if both templates differ)
  • Processor and tokenizer templates should be synced at some point.
  • There is no processor abstraction in swift-transformers.

cc @Rocketknight1 in case I'm missing some weird edge case.
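The precedence rule this PR adopts (chat_template.json wins when present, tokenizer_config.json is the fallback) can be sketched in isolation. Below is a minimal Swift sketch using plain dictionaries as stand-ins for the library's `Config` type; `resolveChatTemplate` is a hypothetical name for illustration, not part of the swift-transformers API:

```swift
// Hypothetical helper: pick the chat template the way this PR does.
// chat_template.json, when present and carrying a template, overrides
// whatever tokenizer_config.json declares.
func resolveChatTemplate(
    tokenizerConfig: [String: Any]?,
    chatTemplateConfig: [String: Any]?
) -> String? {
    // The standalone file takes precedence when it defines a template.
    if let override = chatTemplateConfig?["chat_template"] as? String {
        return override
    }
    // Otherwise fall back to the tokenizer config's template, if any.
    return tokenizerConfig?["chat_template"] as? String
}

let fromTokenizer: [String: Any] = ["chat_template": "tokenizer template"]
let fromJsonFile: [String: Any] = ["chat_template": "vision template"]

// chat_template.json present: its template wins.
print(resolveChatTemplate(tokenizerConfig: fromTokenizer, chatTemplateConfig: fromJsonFile) ?? "none")
// chat_template.json absent: fall back to tokenizer_config.json.
print(resolveChatTemplate(tokenizerConfig: fromTokenizer, chatTemplateConfig: nil) ?? "none")
```

Note this is the opposite of the transformers tokenizer behavior the comment describes (where the tokenizer's own template would win); it matches the processor behavior instead.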

            // If chat_template.json exists and contains a chat_template field, use it to override the tokenizer_config
            if let chatTemplate = chatTemplateConfig.chatTemplate?.stringValue {
Review comment (member):

Technically, this could be an array too, but that is discouraged.
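As an aside on the array form mentioned above: in transformers, a `chat_template` array holds named templates, and the entry named "default" is the one used when no template name is requested. A hedged sketch of how that form could be normalized if it ever needed support — names and shapes below are illustrative only, and this PR handles only the string form:

```swift
// Illustrative only: normalize a chat_template value that may be either a
// plain string or an array of named templates ([{"name": ..., "template": ...}]).
// The entry named "default" wins; otherwise fall back to the first entry.
func defaultChatTemplate(from value: Any?) -> String? {
    if let single = value as? String {
        return single
    }
    if let entries = value as? [[String: String]] {
        if let named = entries.first(where: { $0["name"] == "default" }) {
            return named["template"]
        }
        return entries.first?["template"]
    }
    return nil
}

// String form passes through unchanged.
print(defaultChatTemplate(from: "{{ messages }}") ?? "none")
// Array form resolves to the entry named "default".
let templates = [
    ["name": "tool_use", "template": "tool template"],
    ["name": "default", "template": "default template"],
]
print(defaultChatTemplate(from: templates) ?? "none")
```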

                var updatedConfig: Config
                if var configDict = tokenizerConfig?.dictionary {
                    // Override the chat template in the existing tokenizer config
                    configDict["chat_template"] = chatTemplate
                    updatedConfig = Config(configDict)
                } else {
                    // Create a new config with just the chat template
                    updatedConfig = Config(["chat_template": chatTemplate])
                }
                let tokenizerVocab = try hubApi.configuration(fileURL: modelFolder.appending(path: "tokenizer.json"))
                let configs = Configurations(
                    modelConfig: modelConfig,
                    tokenizerConfig: updatedConfig,
                    tokenizerData: tokenizerVocab
                )
                return configs
Review comment (member):
Not a fan of repeating this code block and the return here, inside the nested ifs.

I'd recommend we write a helper function to potentially update the chat template, such as:

    func updatedTokenizerConfig(tokenizerConfig: Config?, chatTemplateConfig: Config?) -> Config? {
        guard
            let chatTemplateConfig = chatTemplateConfig,
            let overrideChatTemplate = chatTemplateConfig.chatTemplate?.stringValue else {
            return tokenizerConfig
        }

        var configDict = tokenizerConfig?.dictionary ?? [:]
        configDict["chat_template"] = overrideChatTemplate
        return Config(configDict)
    }

And then we can just use this before the return:

        let configs = Configurations(
            modelConfig: modelConfig,
            tokenizerConfig: updatedTokenizerConfig(tokenizerConfig: tokenizerConfig, chatTemplateConfig: chatTemplateConfig),
            tokenizerData: tokenizerVocab
        )

            }
        }
        // If chat_template.json doesn't exist or doesn't have a chat_template field, use the tokenizer_config as is
        let tokenizerVocab = try hubApi.configuration(fileURL: modelFolder.appending(path: "tokenizer.json"))

        let configs = Configurations(
            modelConfig: modelConfig,
            tokenizerConfig: tokenizerConfig,
37 changes: 37 additions & 0 deletions Tests/TokenizersTests/ChatTemplateTests.swift
@@ -178,6 +178,43 @@ What is the weather in Paris today?<|im_end|>
        XCTAssertTrue(tokenizer.hasChatTemplate)
    }

    // Test for vision models with a vision chat template in chat_template.json
    func testChatTemplateFromChatTemplateJson() async throws {
        let visionMessages = [
            [
                "role": "user",
                "content": [
                    [
                        "type": "text",
                        "text": "What's in this image?",
                    ] as [String: String],
                    [
                        "type": "image",
                        "image_url": "example.jpg",
                    ] as [String: String],
                ] as [[String: String]],
            ] as [String: Any]
        ] as [[String: Any]]
        // Qwen 2 VL does not have a chat_template.json file. The chat template is in tokenizer_config.json.
        let qwen2VLTokenizer = try await AutoTokenizer.from(pretrained: "mlx-community/Qwen2-VL-7B-Instruct-4bit")
        // Qwen 2.5 VL has a chat_template.json file with a different chat template than the one in tokenizer_config.json.
        let qwen2_5VLTokenizer = try await AutoTokenizer.from(pretrained: "mlx-community/Qwen2.5-VL-7B-Instruct-4bit")
        let qwen2VLEncoded = try qwen2VLTokenizer.applyChatTemplate(messages: visionMessages)
        let qwen2VLDecoded = qwen2VLTokenizer.decode(tokens: qwen2VLEncoded)
        let qwen2_5VLEncoded = try qwen2_5VLTokenizer.applyChatTemplate(messages: visionMessages)
        let qwen2_5VLDecoded = qwen2_5VLTokenizer.decode(tokens: qwen2_5VLEncoded)
        let expectedOutput = """
            <|im_start|>system
            You are a helpful assistant.<|im_end|>
            <|im_start|>user
            What's in this image?<|vision_start|><|image_pad|><|vision_end|><|im_end|>
            <|im_start|>assistant

            """
        XCTAssertTrue(qwen2VLEncoded == qwen2_5VLEncoded)
        XCTAssertTrue(qwen2VLDecoded == qwen2_5VLDecoded && qwen2_5VLDecoded == expectedOutput)
Review comment (member):

Suggested change:
-       XCTAssertTrue(qwen2VLEncoded == qwen2_5VLEncoded)
-       XCTAssertTrue(qwen2VLDecoded == qwen2_5VLDecoded && qwen2_5VLDecoded == expectedOutput)
+       XCTAssertEqual(qwen2VLEncoded, qwen2_5VLEncoded, "Encoded sequences should be equal")
+       XCTAssertEqual(qwen2VLDecoded, qwen2_5VLDecoded, "Decoded sequences should be equal")
+       XCTAssertEqual(qwen2_5VLDecoded, expectedOutput, "Decoded should match expected")

Nit: should provide better error messages, maybe.

    }

    func testApplyTemplateError() async throws {
        let tokenizer = try await AutoTokenizer.from(pretrained: "google-bert/bert-base-uncased")
        XCTAssertFalse(tokenizer.hasChatTemplate)