
Commit 8e41311

support for running all the models (#317)
* support for running all the models
  - add a --download argument
  - add a `list` command
  - support scripts to write a script to run all the models
* updates for models / templates that do not accept system role
1 parent 4831d42 commit 8e41311

12 files changed, +298 -7 lines

Libraries/MLXLLM/LLMModel.swift

Lines changed: 10 additions & 0 deletions
@@ -2,9 +2,15 @@

 import MLX
 import MLXLMCommon
+import Tokenizers

 /// Marker protocol for LLMModels
 public protocol LLMModel: LanguageModel, LoRAModel {
+
+    /// Models can implement this if they need a custom `MessageGenerator`.
+    ///
+    /// The default implementation returns `DefaultMessageGenerator`.
+    func messageGenerator(tokenizer: Tokenizer) -> MessageGenerator
 }

 extension LLMModel {
@@ -30,4 +36,8 @@ extension LLMModel {

         return .tokens(y)
     }
+
+    public func messageGenerator(tokenizer: Tokenizer) -> MessageGenerator {
+        DefaultMessageGenerator()
+    }
 }
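
Because the extension supplies a default that returns `DefaultMessageGenerator`, the new requirement is source-compatible: existing `LLMModel` conformers compile unchanged and keep the old behavior, while models whose chat templates reject certain roles (Gemma, Gemma2, and Llama below) override the hook to substitute their own generator.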

Libraries/MLXLLM/LLMModelFactory.swift

Lines changed: 8 additions & 1 deletion
@@ -331,11 +331,18 @@ public class LLMModelFactory: ModelFactory {

         let tokenizer = try await loadTokenizer(configuration: configuration, hub: hub)

+        let messageGenerator =
+            if let model = model as? LLMModel {
+                model.messageGenerator(tokenizer: tokenizer)
+            } else {
+                DefaultMessageGenerator()
+            }
+
         return .init(
             configuration: configuration, model: model,
             processor: LLMUserInputProcessor(
                 tokenizer: tokenizer, configuration: configuration,
-                messageGenerator: DefaultMessageGenerator()),
+                messageGenerator: messageGenerator),
             tokenizer: tokenizer)
     }

Libraries/MLXLLM/Models/Gemma.swift

Lines changed: 5 additions & 0 deletions
@@ -4,6 +4,7 @@ import Foundation
 import MLX
 import MLXLMCommon
 import MLXNN
+import Tokenizers

 // Port of https://github.com/ml-explore/mlx-examples/blob/main/llms/mlx_lm/models/gemma.py

@@ -187,6 +188,10 @@ public class GemmaModel: Module, LLMModel, KVCacheDimensionProvider {
         let out = model(inputs, cache: cache)
         return model.embedTokens.asLinear(out)
     }
+
+    public func messageGenerator(tokenizer: any Tokenizer) -> any MessageGenerator {
+        NoSystemMessageGenerator()
+    }
 }

 public struct GemmaConfiguration: Codable, Sendable {

Libraries/MLXLLM/Models/Gemma2.swift

Lines changed: 6 additions & 1 deletion
@@ -4,6 +4,7 @@ import Foundation
 import MLX
 import MLXLMCommon
 import MLXNN
+import Tokenizers

 // Port of https://github.com/ml-explore/mlx-examples/blob/main/llms/mlx_lm/models/gemma2.py

@@ -212,6 +213,10 @@ public class Gemma2Model: Module, LLMModel, KVCacheDimensionProvider {
         out = tanh(out / logitSoftCap) * logitSoftCap
         return out
     }
+
+    public func messageGenerator(tokenizer: any Tokenizer) -> any MessageGenerator {
+        NoSystemMessageGenerator()
+    }
 }

 public struct Gemma2Configuration: Codable {
@@ -245,7 +250,7 @@ public struct Gemma2Configuration: Codable {
         case queryPreAttnScalar = "query_pre_attn_scalar"
     }

-    public init(from decoder: Decoder) throws {
+    public init(from decoder: Swift.Decoder) throws {
         // Custom implementation to handle optional keys with required values
         let container: KeyedDecodingContainer<CodingKeys> = try decoder.container(
             keyedBy: CodingKeys.self)

Libraries/MLXLLM/Models/Llama.swift

Lines changed: 19 additions & 1 deletion
@@ -4,6 +4,7 @@ import Foundation
 import MLX
 import MLXLMCommon
 import MLXNN
+import Tokenizers

 // port of https://github.com/ml-explore/mlx-examples/blob/main/llms/mlx_lm/models/llama.py

@@ -316,6 +317,23 @@ public class LlamaModel: Module, LLMModel, KVCacheDimensionProvider {
             !$0.key.contains("self_attn.rotary_emb.inv_freq")
         }
     }
+
+    public func messageGenerator(tokenizer: any Tokenizer) -> any MessageGenerator {
+        // some models allow the system role and some do not -- this is enforced
+        // by the chat template (code).
+        do {
+            let probe = [
+                [
+                    "role": "system",
+                    "content": "test",
+                ]
+            ]
+            _ = try tokenizer.applyChatTemplate(messages: probe)
+            return DefaultMessageGenerator()
+        } catch {
+            return NoSystemMessageGenerator()
+        }
+    }
 }

 public struct LlamaConfiguration: Codable, Sendable {
@@ -382,7 +400,7 @@ public struct LlamaConfiguration: Codable, Sendable {
         case mlpBias = "mlp_bias"
     }

-    public init(from decoder: Decoder) throws {
+    public init(from decoder: Swift.Decoder) throws {
         let container = try decoder.container(keyedBy: CodingKeys.self)

         hiddenSize = try container.decode(Int.self, forKey: .hiddenSize)
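
Unlike Gemma, which hardcodes `NoSystemMessageGenerator`, the Llama class covers many community fine-tunes with differing chat templates, so it probes the template once at load time: if rendering a system message throws, the system role is unsupported. A sketch of the same probe factored into a free function that other model families could reuse (the function name is hypothetical, not part of this commit):

import MLXLMCommon
import Tokenizers

// Hypothetical helper (illustrative only): render a one-message system chat;
// templates that reject the system role throw, so fall back to the
// system-filtering generator in that case.
func probeMessageGenerator(_ tokenizer: Tokenizer) -> MessageGenerator {
    let probe = [["role": "system", "content": "test"]]
    if (try? tokenizer.applyChatTemplate(messages: probe)) != nil {
        return DefaultMessageGenerator()
    } else {
        return NoSystemMessageGenerator()
    }
}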

Libraries/MLXLMCommon/Chat.swift

Lines changed: 33 additions & 2 deletions
@@ -63,12 +63,25 @@ public enum Chat {
 /// ```
 public protocol MessageGenerator {

+    /// Generates messages from the input.
+    func generate(from input: UserInput) -> [Message]
+
+    /// Returns array of `[String: Any]` aka ``Message``
+    func generate(messages: [Chat.Message]) -> [Message]
+
     /// Returns `[String: Any]` aka ``Message``.
     func generate(message: Chat.Message) -> Message
 }

 extension MessageGenerator {
-    /// Returns array of `[String: Any]` aka ``Message``
+
+    public func generate(message: Chat.Message) -> Message {
+        [
+            "role": message.role.rawValue,
+            "content": message.content,
+        ]
+    }
+
     public func generate(messages: [Chat.Message]) -> [Message] {
         var rawMessages: [Message] = []

@@ -80,7 +93,6 @@ extension MessageGenerator {
         return rawMessages
     }

-    /// Generates messages from the input.
     public func generate(from input: UserInput) -> [Message] {
         switch input.prompt {
         case .text(let text):
@@ -112,3 +124,22 @@ public struct DefaultMessageGenerator: MessageGenerator {
         ]
     }
 }
+
+/// Implementation of ``MessageGenerator`` that produces a
+/// `role` and `content` but omits `system` roles.
+///
+/// ```swift
+/// [
+///     "role": message.role.rawValue,
+///     "content": message.content,
+/// ]
+/// ```
+public struct NoSystemMessageGenerator: MessageGenerator {
+    public init() {}
+
+    public func generate(messages: [Chat.Message]) -> [Message] {
+        messages
+            .filter { $0.role != .system }
+            .map { generate(message: $0) }
+    }
+}
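
A quick check of the filtering behavior. This sketch assumes the `.system`/`.user` conveniences on `Chat.Message`; construct the messages however your call site does:

import MLXLMCommon

let chat: [Chat.Message] = [
    .system("You are a helpful assistant."),
    .user("Hello!"),
]
let raw = NoSystemMessageGenerator().generate(messages: chat)
// raw == [["role": "user", "content": "Hello!"]] -- the system entry is dropped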

Tools/llm-tool/LLMTool.swift

Lines changed: 20 additions & 2 deletions
@@ -14,7 +14,10 @@ import Tokenizers
 struct LLMTool: AsyncParsableCommand {
     static let configuration = CommandConfiguration(
         abstract: "Command line tool for generating text and manipulating LLMs",
-        subcommands: [EvaluateCommand.self, ChatCommand.self, LoRACommand.self],
+        subcommands: [
+            EvaluateCommand.self, ChatCommand.self, LoRACommand.self,
+            ListCommands.self,
+        ],
         defaultSubcommand: EvaluateCommand.self)
 }

@@ -24,6 +27,9 @@ struct ModelArguments: ParsableArguments, Sendable {
     @Option(name: .long, help: "Name of the Hugging Face model or absolute path to directory")
     var model: String?

+    @Option(help: "Hub download directory")
+    var download: URL?
+
     @Sendable
     func load(defaultModel: String, modelFactory: ModelFactory) async throws -> ModelContainer {
         let modelConfiguration: ModelConfiguration
@@ -39,7 +45,15 @@ struct ModelArguments: ParsableArguments, Sendable {
             // identifier
             modelConfiguration = modelFactory.configuration(id: modelName)
         }
-        return try await modelFactory.loadContainer(configuration: modelConfiguration)
+
+        let hub =
+            if let download {
+                HubApi(downloadBase: download)
+            } else {
+                HubApi()
+            }
+
+        return try await modelFactory.loadContainer(hub: hub, configuration: modelConfiguration)
     }
 }

@@ -313,6 +327,10 @@ struct EvaluateCommand: AsyncParsableCommand {
             return try await generate.generate(input: input, context: context)
         }

+        // wait for any asynchronous cleanup, e.g. tearing down compiled functions,
+        // before the task exits -- this would race with mlx::core shutdown
+        try await Task.sleep(for: .milliseconds(30))
+
         if !generate.quiet {
             print("------")
             print(result.summary())
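
With `--download`, repeated runs share one Hub cache directory instead of re-fetching weights into the default location; for example (the model id is a placeholder, any id printed by `list llms` works):

./mlx-run llm-tool eval --download ~/Downloads/huggingface --model <model-id>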

Tools/llm-tool/ListCommands.swift

Lines changed: 51 additions & 0 deletions
@@ -0,0 +1,51 @@
+// Copyright © 2025 Apple Inc.
+
+import ArgumentParser
+import Foundation
+import MLXLLM
+import MLXVLM
+
+struct ListCommands: AsyncParsableCommand {
+
+    static let configuration = CommandConfiguration(
+        commandName: "list",
+        abstract: "List registered model configurations",
+        subcommands: [
+            ListLLMCommand.self, ListVLMCommand.self,
+        ]
+    )
+}
+
+struct ListLLMCommand: AsyncParsableCommand {
+
+    static let configuration = CommandConfiguration(
+        commandName: "llms",
+        abstract: "List registered LLM model configurations"
+    )
+
+    func run() async throws {
+        for configuration in LLMRegistry.shared.models {
+            switch configuration.id {
+            case .id(let id): print(id)
+            case .directory: break
+            }
+        }
+    }
+}
+
+struct ListVLMCommand: AsyncParsableCommand {
+
+    static let configuration = CommandConfiguration(
+        commandName: "vlms",
+        abstract: "List registered VLM model configurations"
+    )
+
+    func run() async throws {
+        for configuration in VLMRegistry.shared.models {
+            switch configuration.id {
+            case .id(let id): print(id)
+            case .directory: break
+            }
+        }
+    }
+}
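
Both subcommands print one model id per line and skip directory-based configurations, so the output pipes cleanly into scripts such as the generator below:

./mlx-run llm-tool list llms
./mlx-run llm-tool list vlms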

mlx-swift-examples.xcodeproj/project.pbxproj

Lines changed: 4 additions & 0 deletions
@@ -57,6 +57,7 @@
 		C36BF0082BC5CE56002D4AFE /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = C36BF0072BC5CE56002D4AFE /* Assets.xcassets */; };
 		C36BF00C2BC5CE56002D4AFE /* Preview Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = C36BF00B2BC5CE56002D4AFE /* Preview Assets.xcassets */; };
 		C36BF0352BC70F11002D4AFE /* Arguments.swift in Sources */ = {isa = PBXBuildFile; fileRef = C36BF0342BC70F11002D4AFE /* Arguments.swift */; };
+		C37133A22DD6524B00D19830 /* ListCommands.swift in Sources */ = {isa = PBXBuildFile; fileRef = C37133A12DD6524B00D19830 /* ListCommands.swift */; };
 		C38BA3AA2DB8321600BAFA88 /* Chat.swift in Sources */ = {isa = PBXBuildFile; fileRef = C38BA3A92DB8321600BAFA88 /* Chat.swift */; };
 		C392737D2B606A1D00368D5D /* Tutorial.swift in Sources */ = {isa = PBXBuildFile; fileRef = C392737C2B606A1D00368D5D /* Tutorial.swift */; };
 		C397C59C2B62C6D0004B084D /* ArgumentParser in Frameworks */ = {isa = PBXBuildFile; productRef = C397C59B2B62C6D0004B084D /* ArgumentParser */; };
@@ -232,6 +233,7 @@
 		C36BF0092BC5CE56002D4AFE /* StableDiffusionExample.entitlements */ = {isa = PBXFileReference; lastKnownFileType = text.plist.entitlements; path = StableDiffusionExample.entitlements; sourceTree = "<group>"; };
 		C36BF00B2BC5CE56002D4AFE /* Preview Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = "Preview Assets.xcassets"; sourceTree = "<group>"; };
 		C36BF0342BC70F11002D4AFE /* Arguments.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = Arguments.swift; sourceTree = "<group>"; };
+		C37133A12DD6524B00D19830 /* ListCommands.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ListCommands.swift; sourceTree = "<group>"; };
 		C38BA3A92DB8321600BAFA88 /* Chat.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = Chat.swift; sourceTree = "<group>"; };
 		C39273742B606A0A00368D5D /* Tutorial */ = {isa = PBXFileReference; explicitFileType = "compiled.mach-o.executable"; includeInIndex = 0; path = Tutorial; sourceTree = BUILT_PRODUCTS_DIR; };
 		C392737C2B606A1D00368D5D /* Tutorial.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = Tutorial.swift; sourceTree = "<group>"; };
@@ -481,6 +483,7 @@
 				C36BEFB32BBDEA69002D4AFE /* LoraCommands.swift */,
 				C36BEFB62BBDECBC002D4AFE /* Arguments.swift */,
 				C38BA3A92DB8321600BAFA88 /* Chat.swift */,
+				C37133A12DD6524B00D19830 /* ListCommands.swift */,
 			);
 			path = "llm-tool";
 			sourceTree = "<group>";
@@ -1148,6 +1151,7 @@
 				C38BA3AA2DB8321600BAFA88 /* Chat.swift in Sources */,
 				C34E48F52B696F0B00FCB841 /* LLMTool.swift in Sources */,
 				C36BEFB52BBDEAD8002D4AFE /* LoraCommands.swift in Sources */,
+				C37133A22DD6524B00D19830 /* ListCommands.swift in Sources */,
 			);
 			runOnlyForDeploymentPostprocessing = 0;
 		};

support/generate-run-all-llms.sh

Lines changed: 12 additions & 0 deletions
@@ -0,0 +1,12 @@
+#!/bin/sh
+
+echo "#!/bin/sh"
+echo "# NOTE: GENERATED BY generate-run-all-llms.sh -- DO NOT MODIFY BY HAND"
+
+./mlx-run llm-tool list llms | \
+    awk '{printf "./mlx-run llm-tool eval --download ~/Downloads/huggingface --model %s\n", $0}' | \
+    awk '{printf "echo\necho ======\necho '\''%s'\''\n%s\n", $0, $0}'
+
+./mlx-run llm-tool list vlms | \
+    awk '{printf "./mlx-run llm-tool eval --download ~/Downloads/huggingface --model %s --resize 512 --image support/test.jpg\n", $0}' | \
+    awk '{printf "echo\necho ======\necho '\''%s'\''\n%s\n", $0, $0}'
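
The generator emits the combined script on stdout rather than writing a file, so a typical invocation redirects it (the output file name here is illustrative):

./support/generate-run-all-llms.sh > run-all-llms.sh
sh run-all-llms.sh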
