From da1e193d9d07e5bbc810e3b7c481488b90aff8cf Mon Sep 17 00:00:00 2001 From: Anthony DePasquale Date: Wed, 26 Feb 2025 09:27:50 +0100 Subject: [PATCH 1/2] Prefer chat_template.json for chat template --- Sources/Hub/Hub.swift | 27 +++++++++++++- Tests/TokenizersTests/ChatTemplateTests.swift | 37 +++++++++++++++++++ 2 files changed, 62 insertions(+), 2 deletions(-) diff --git a/Sources/Hub/Hub.swift b/Sources/Hub/Hub.swift index 4116dcb..f8c7238 100644 --- a/Sources/Hub/Hub.swift +++ b/Sources/Hub/Hub.swift @@ -177,7 +177,7 @@ public class LanguageModelConfigurationFromHub { modelName: String, hubApi: HubApi = .shared ) async throws -> Configurations { - let filesToDownload = ["config.json", "tokenizer_config.json", "tokenizer.json"] + let filesToDownload = ["config.json", "tokenizer_config.json", "chat_template.json", "tokenizer.json"] let repo = Hub.Repo(id: modelName) let downloadedModelFolder = try await hubApi.snapshot(from: repo, matching: filesToDownload) @@ -190,9 +190,32 @@ public class LanguageModelConfigurationFromHub { ) async throws -> Configurations { // Note tokenizerConfig may be nil (does not exist in all models) let modelConfig = try hubApi.configuration(fileURL: modelFolder.appending(path: "config.json")) + // First try to get the tokenizer_config.json let tokenizerConfig = try? hubApi.configuration(fileURL: modelFolder.appending(path: "tokenizer_config.json")) + // Check for chat_template.json, which contains the preferred chat template for vision language models + if let chatTemplateConfig = try? hubApi.configuration(fileURL: modelFolder.appending(path: "chat_template.json")) { + // If chat_template.json exists and contains a chat_template field, use it to override the tokenizer_config + if let chatTemplate = chatTemplateConfig.chatTemplate?.stringValue { + var updatedConfig: Config + if var configDict = tokenizerConfig?.dictionary { + // Override the chat template in the existing tokenizer config + configDict["chat_template"] = chatTemplate + updatedConfig = Config(configDict) + } else { + // Create a new config with just the chat template + updatedConfig = Config(["chat_template": chatTemplate]) + } + let tokenizerVocab = try hubApi.configuration(fileURL: modelFolder.appending(path: "tokenizer.json")) + let configs = Configurations( + modelConfig: modelConfig, + tokenizerConfig: updatedConfig, + tokenizerData: tokenizerVocab + ) + return configs + } + } + // If chat_template.json doesn't exist or doesn't have a chat_template field, use the tokenizer_config as is let tokenizerVocab = try hubApi.configuration(fileURL: modelFolder.appending(path: "tokenizer.json")) - let configs = Configurations( modelConfig: modelConfig, tokenizerConfig: tokenizerConfig, diff --git a/Tests/TokenizersTests/ChatTemplateTests.swift b/Tests/TokenizersTests/ChatTemplateTests.swift index 13d897d..ea3d07a 100644 --- a/Tests/TokenizersTests/ChatTemplateTests.swift +++ b/Tests/TokenizersTests/ChatTemplateTests.swift @@ -178,6 +178,43 @@ What is the weather in Paris today?<|im_end|> XCTAssertTrue(tokenizer.hasChatTemplate) } + // Test for vision models with a vision chat template in chat_template.json + func testChatTemplateFromChatTemplateJson() async throws { + let visionMessages = [ + [ + "role": "user", + "content": [ + [ + "type": "text", + "text": "What's in this image?", + ] as [String: String], + [ + "type": "image", + "image_url": "example.jpg", + ] as [String: String], + ] as [[String: String]], + ] as [String: Any] + ] as [[String: Any]] + // Qwen 2 VL does not have a chat_template.json file. The chat template is in tokenizer_config.json. + let qwen2VLTokenizer = try await AutoTokenizer.from(pretrained: "mlx-community/Qwen2-VL-7B-Instruct-4bit") + // Qwen 2.5 VL has a chat_template.json file with a different chat template than the one in tokenizer_config.json. + let qwen2_5VLTokenizer = try await AutoTokenizer.from(pretrained: "mlx-community/Qwen2.5-VL-7B-Instruct-4bit") + let qwen2VLEncoded = try qwen2VLTokenizer.applyChatTemplate(messages: visionMessages) + let qwen2VLDecoded = qwen2VLTokenizer.decode(tokens: qwen2VLEncoded) + let qwen2_5VLEncoded = try qwen2_5VLTokenizer.applyChatTemplate(messages: visionMessages) + let qwen2_5VLDecoded = qwen2_5VLTokenizer.decode(tokens: qwen2_5VLEncoded) + let expectedOutput = """ +<|im_start|>system +You are a helpful assistant.<|im_end|> +<|im_start|>user +What's in this image?<|vision_start|><|image_pad|><|vision_end|><|im_end|> +<|im_start|>assistant + +""" + XCTAssertTrue(qwen2VLEncoded == qwen2_5VLEncoded) + XCTAssertTrue(qwen2VLDecoded == qwen2_5VLDecoded && qwen2_5VLDecoded == expectedOutput) + } + func testApplyTemplateError() async throws { let tokenizer = try await AutoTokenizer.from(pretrained: "google-bert/bert-base-uncased") XCTAssertFalse(tokenizer.hasChatTemplate) From 4bebf9e1c0d4e5732092070d193b08e7faaab195 Mon Sep 17 00:00:00 2001 From: Anthony DePasquale Date: Thu, 27 Feb 2025 09:27:01 +0100 Subject: [PATCH 2/2] Refinements --- Sources/Hub/Hub.swift | 46 +++++++------------ Tests/TokenizersTests/ChatTemplateTests.swift | 5 +- 2 files changed, 20 insertions(+), 31 deletions(-) diff --git a/Sources/Hub/Hub.swift b/Sources/Hub/Hub.swift index f8c7238..b303736 100644 --- a/Sources/Hub/Hub.swift +++ b/Sources/Hub/Hub.swift @@ -183,45 +183,33 @@ public class LanguageModelConfigurationFromHub { return try await loadConfig(modelFolder: downloadedModelFolder, hubApi: hubApi) } - + func loadConfig( modelFolder: URL, hubApi: HubApi = .shared ) async throws -> Configurations { - // Note tokenizerConfig may be nil (does not exist in all models) + // Load required configurations let modelConfig = try hubApi.configuration(fileURL: modelFolder.appending(path: "config.json")) - // First try to get the tokenizer_config.json - let tokenizerConfig = try? hubApi.configuration(fileURL: modelFolder.appending(path: "tokenizer_config.json")) - // Check for chat_template.json, which contains the preferred chat template for vision language models - if let chatTemplateConfig = try? hubApi.configuration(fileURL: modelFolder.appending(path: "chat_template.json")) { - // If chat_template.json exists and contains a chat_template field, use it to override the tokenizer_config - if let chatTemplate = chatTemplateConfig.chatTemplate?.stringValue { - var updatedConfig: Config - if var configDict = tokenizerConfig?.dictionary { - // Override the chat template in the existing tokenizer config - configDict["chat_template"] = chatTemplate - updatedConfig = Config(configDict) - } else { - // Create a new config with just the chat template - updatedConfig = Config(["chat_template": chatTemplate]) - } - let tokenizerVocab = try hubApi.configuration(fileURL: modelFolder.appending(path: "tokenizer.json")) - let configs = Configurations( - modelConfig: modelConfig, - tokenizerConfig: updatedConfig, - tokenizerData: tokenizerVocab - ) - return configs + let tokenizerData = try hubApi.configuration(fileURL: modelFolder.appending(path: "tokenizer.json")) + // Load tokenizer config + var tokenizerConfig = try? hubApi.configuration(fileURL: modelFolder.appending(path: "tokenizer_config.json")) + // Check for chat template and merge if available + if let chatTemplateConfig = try? hubApi.configuration(fileURL: modelFolder.appending(path: "chat_template.json")), + let chatTemplate = chatTemplateConfig.chatTemplate?.stringValue { + // The value of chat_template could also be an array of strings, but we're not handling that case here, since it's discouraged. + // Create or update tokenizer config with chat template + if var configDict = tokenizerConfig?.dictionary { + configDict["chat_template"] = chatTemplate + tokenizerConfig = Config(configDict) + } else { + tokenizerConfig = Config(["chat_template": chatTemplate]) } } - // If chat_template.json doesn't exist or doesn't have a chat_template field, use the tokenizer_config as is - let tokenizerVocab = try hubApi.configuration(fileURL: modelFolder.appending(path: "tokenizer.json")) - let configs = Configurations( + return Configurations( modelConfig: modelConfig, tokenizerConfig: tokenizerConfig, - tokenizerData: tokenizerVocab + tokenizerData: tokenizerData ) - return configs } static func fallbackTokenizerConfig(for modelType: String) -> Config? { diff --git a/Tests/TokenizersTests/ChatTemplateTests.swift b/Tests/TokenizersTests/ChatTemplateTests.swift index ea3d07a..88e1843 100644 --- a/Tests/TokenizersTests/ChatTemplateTests.swift +++ b/Tests/TokenizersTests/ChatTemplateTests.swift @@ -211,8 +211,9 @@ What's in this image?<|vision_start|><|image_pad|><|vision_end|><|im_end|> <|im_start|>assistant """ - XCTAssertTrue(qwen2VLEncoded == qwen2_5VLEncoded) - XCTAssertTrue(qwen2VLDecoded == qwen2_5VLDecoded && qwen2_5VLDecoded == expectedOutput) + XCTAssertEqual(qwen2VLEncoded, qwen2_5VLEncoded, "Encoded sequences should be equal") + XCTAssertEqual(qwen2VLDecoded, qwen2_5VLDecoded, "Decoded sequences should be equal") + XCTAssertEqual(qwen2_5VLDecoded, expectedOutput, "Decoded sequence should match expected output") } func testApplyTemplateError() async throws {