Use chat templates for vision models #173

Merged · 6 commits · Feb 12, 2025

30 changes: 27 additions & 3 deletions Applications/VLMEval/ContentView.swift
@@ -383,9 +383,33 @@ class VLMEvaluator {
MLXRandom.seed(UInt64(Date.timeIntervalSinceReferenceDate * 1000))

let result = try await modelContainer.perform { context in
-    let images: [UserInput.Image] = image != nil ? [.ciImage(image!)] : []
-    let videos: [UserInput.Video] = videoURL != nil ? [.url(videoURL!)] : []
-    var userInput = UserInput(prompt: prompt, images: images, videos: videos)
+    let images: [UserInput.Image] =
+        if let image {
+            [UserInput.Image.ciImage(image)]
+        } else {
+            []
+        }
+    let videos: [UserInput.Video] =
+        if let videoURL {
+            [.url(videoURL)]
+        } else {
+            []
+        }
+    var userInput = UserInput(
+        messages: [
+            [
+                "role": "user",
+                "content": [
+                    ["type": "text", "text": prompt]
+                ]
+                    + images.map { _ in
+                        ["type": "image"]
+                    }
+                    + videos.map { _ in
+                        ["type": "video"]
+                    },
Collaborator

I wonder if this should happen inside Qwen2VLProcessor? As it stands the llm-tool doesn't work because it doesn't have this augmentation.

In the python code this is specific to the model, but handled outside the model/processing code. I think it belongs with the UserInputProcessor as that is where all of these would come together.
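
For reference, here is a minimal sketch of what that could look like if the augmentation lived in a model-specific UserInputProcessor rather than in each app. The helper name and the placement logic are assumptions for illustration, not the current Qwen2VLProcessor API:

    import MLXLMCommon

    // Hypothetical helper: append one placeholder entry per attached image/video
    // to the last user message, mirroring what the app code above constructs.
    func withMediaTokens(_ messages: [Message], for input: UserInput) -> [Message] {
        var messages = messages
        guard var last = messages.last, last["role"] as? String == "user" else {
            return messages
        }

        // Accept either an already-structured content array or a plain string prompt.
        var content: [[String: Any]]
        if let structured = last["content"] as? [[String: Any]] {
            content = structured
        } else if let text = last["content"] as? String {
            content = [["type": "text", "text": text]]
        } else {
            content = []
        }

        // Qwen-style chat templates expect a "type": "image" / "video" entry
        // for every attached image or video.
        content += input.images.map { _ in ["type": "image"] }
        content += input.videos.map { _ in ["type": "video"] }

        last["content"] = content
        messages[messages.count - 1] = last
        return messages
    }

A processor's prepare(input:) could run something like this before applying the chat template, so llm-tool and the example app would not each need their own copy.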

Contributor Author

Leaving the construction of the messages to the app would make this more flexible. I can imagine that in the future there might be different message formats. I'm not certain about this, but this approach is working well for me in my app at the moment.

Collaborator

As it is, it means the llm-tool doesn't work for Qwen models -- it gets an error because it is missing the tokens.

On the python side this looks like:

    # Model to format mapping
    model_to_format = {
        # Models using message_list_with_image format
        "idefics2": "message_list_with_image",
        "idefics3": "message_list_with_image",
        "llava": "message_list_with_image",
        "llava_next": "message_list_with_image",
        "mllama": "message_list_with_image",
        # Models that can handle both image and video formats
        "qwen2_vl": (
            "message_video_with_text"
            if kwargs.get("video")
            else "message_list_with_image"
        ),
        "qwen2_5_vl": (
            "message_video_with_text"
            if kwargs.get("video")
            else "message_list_with_image"
        ),

and this matches the swift code:

    # Message format handlers
    def handle_list_with_image():
        content = [create_text_message(prompt)]
        if role == "user" and not skip_image_token:
            image_tokens = [{"type": "image"}] * num_images
            content = (
                image_tokens + content
                if model_name in ["pixtral", "idefics3"]
                else content + image_tokens
            )
        return {"role": role, "content": content}

So the problems of leaving it to the app are two-fold:

  • each application/tool has to have a copy of this code
  • the code varies per model type and the app needs to have a table mapping to the right message structure

Perhaps we could have a way to mark the messages as already being processed (or the UserInputProcessor could inspect the messages and detect that), leaving it up to the app, but I am not sure what the app would do differently from the generic processing required by the model type.
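
A rough sketch of the detection idea (purely illustrative, nothing like this exists in the processors yet): a processor could treat a message as already processed when its content is the structured array form rather than a plain string.

    import MLXLMCommon

    // Illustrative check only: content that is already an array of typed entries
    // (["type": "text"], ["type": "image"], ...) is assumed to have been built by
    // the app, while a plain String content still needs the generic augmentation.
    func messagesArePreformatted(_ messages: [Message]) -> Bool {
        messages.contains { ($0["content"] as? [[String: Any]]) != nil }
    }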

Collaborator

> I can imagine that in the future there might be different message formats.

Yes, for sure there are, but per the python code they vary by model type and somewhat by the presence of video vs images.
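
A Swift analogue of the python model_to_format table could look roughly like this (a sketch only; neither the function nor these format names exist in this repo):

    // Sketch of a per-model message-format table mirroring the python mapping above.
    func messageFormat(for modelType: String, hasVideo: Bool) -> String {
        switch modelType {
        case "idefics2", "idefics3", "llava", "llava_next", "mllama":
            return "message_list_with_image"
        case "qwen2_vl", "qwen2_5_vl":
            return hasVideo ? "message_video_with_text" : "message_list_with_image"
        default:
            // Other model types elided; assumed default for illustration.
            return "message_list_with_image"
        }
    }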

Contributor Author

I think the issue might be multi-turn conversations. I don't think UserInput has a way to designate to which turn an image or video belongs, so perhaps this is best left to the app?
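
To illustrate the multi-turn point with the message format from this PR (the conversation content is invented for illustration): the app can place an image placeholder on the exact turn it belongs to, which the flat images array on UserInput cannot express by itself.

    import MLXLMCommon

    // Hypothetical multi-turn conversation: each "image" entry is tied to a turn.
    let conversation: [Message] = [
        [
            "role": "user",
            "content": [
                ["type": "text", "text": "What is in this photo?"],
                ["type": "image"]
            ]
        ],
        [
            "role": "assistant",
            "content": [["type": "text", "text": "A cat sitting on a windowsill."]]
        ],
        [
            "role": "user",
            "content": [
                ["type": "text", "text": "And in this one?"],
                ["type": "image"]
            ]
        ]
    ]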

Collaborator

Yes, I think that works ok -- we can get experience with it and take our time on the longer term approach. I suggest:

  • add this code to llm-tool so that it can still work with Qwen
  • add a comment to the code indicating that it may be model dependent (in case somebody adds a model where this doesn't work)

Then we can consider the approach at our leisure. Sound good?

Contributor Author

As it is, apps already have to handle how the system message is treated in the construction of messages, since some models support a system role and others don't.
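
For example, a sketch of the branching an app already does today (supportsSystemRole is an assumed per-model flag, not an existing API):

    import MLXLMCommon

    // Sketch: emit a system message when the model's chat template supports one,
    // otherwise fold the instructions into the first user turn.
    func makeMessages(prompt: String, instructions: String, supportsSystemRole: Bool) -> [Message] {
        var messages: [Message] = []
        var prompt = prompt
        if supportsSystemRole {
            messages.append(["role": "system", "content": instructions])
        } else {
            prompt = instructions + "\n\n" + prompt
        }
        messages.append(["role": "user", "content": [["type": "text", "text": prompt]]])
        return messages
    }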

Collaborator

I think there is a lot of value in getting the chat template code merged sooner rather than perfecting everything around it :-)

Contributor Author

Sounds great! I'll make the changes to llm-tool.

Contributor Author

llm-tool now works.

+            ]
+        ], images: images, videos: videos)
userInput.processing.resize = .init(width: 448, height: 448)

let input = try await context.processor.prepare(input: userInput)
2 changes: 1 addition & 1 deletion Libraries/MLXLLM/LLMModelFactory.swift
@@ -249,7 +249,7 @@ private struct LLMUserInputProcessor: UserInputProcessor {
// but that is not public so just fall back to text
let prompt = input.prompt
.asMessages()
.compactMap { $0["content"] }
.compactMap { $0["content"] as? String }
.joined(separator: ". ")
let promptTokens = tokenizer.encode(text: prompt)
return LMInput(tokens: MLXArray(promptTokens))
14 changes: 8 additions & 6 deletions Libraries/MLXLMCommon/LanguageModel.swift
@@ -69,14 +69,16 @@ public struct LMInput {
/// Representation of prepared input image(s).
public struct ProcessedImage {

+/// Concatenated pixels from one or more images
public let pixels: MLXArray
-public let imageGridThw: [THW]?
+/// Time, height, and width of the images
+public let frames: [THW]?

public init(
-pixels: MLXArray, imageGridThw: [THW]? = nil
+pixels: MLXArray, frames: [THW]? = nil
) {
self.pixels = pixels
-self.imageGridThw = imageGridThw
+self.frames = frames
}
}

@@ -85,13 +87,13 @@ public struct LMInput {
public struct ProcessedVideo {

public let pixels: MLXArray
-public let videoGridThw: [THW]?
+public let frames: [THW]?

public init(
-pixels: MLXArray, videoGridThw: [THW]? = nil
+pixels: MLXArray, frames: [THW]? = nil
) {
self.pixels = pixels
-self.videoGridThw = videoGridThw
+self.frames = frames
}
}

11 changes: 7 additions & 4 deletions Libraries/MLXLMCommon/UserInput.swift
@@ -6,18 +6,19 @@ import Foundation
import MLX
import Tokenizers

+public typealias Message = [String: Any]

/// Container for raw user input.
///
/// A ``UserInputProcessor`` can convert this to ``LMInput``.
/// See also ``ModelContext``.
public struct UserInput: Sendable {

/// Representation of a prompt or series of messages (conversation).
public enum Prompt: Sendable, CustomStringConvertible {
case text(String)
-case messages([[String: String]])
+case messages([Message])

-public func asMessages() -> [[String: String]] {
+public func asMessages() -> [Message] {
switch self {
case .text(let text):
return [["role": "user", "content": text]]
@@ -144,11 +145,13 @@ public struct UserInput {
}

public init(
-messages: [[String: String]], images: [Image] = [Image](), tools: [ToolSpec]? = nil,
+messages: [Message], images: [Image] = [Image](), videos: [Video] = [Video](),
+tools: [ToolSpec]? = nil,
additionalContext: [String: Any]? = nil
) {
self.prompt = .messages(messages)
self.images = images
+self.videos = videos
self.tools = tools
self.additionalContext = additionalContext
}
2 changes: 1 addition & 1 deletion Libraries/MLXVLM/Models/Idefics3.swift
@@ -805,7 +805,7 @@ public class Idefics3Processor: UserInputProcessor {
}

public func prepare(input: UserInput) throws -> LMInput {
-let prompt = input.prompt.asMessages().last?["content"] ?? ""
+let prompt = input.prompt.asMessages().last?["content"] as? String ?? ""

if input.images.isEmpty {
// No image scenario
2 changes: 1 addition & 1 deletion Libraries/MLXVLM/Models/Paligemma.swift
@@ -478,7 +478,7 @@ public class PaligGemmaProcessor: UserInputProcessor {
}

// this doesn't have a chat template so just use the last message.
-var prompt = input.prompt.asMessages().last?["content"] ?? ""
+var prompt = input.prompt.asMessages().last?["content"] as? String ?? ""

// based on transformers/processing_paligemma
let count = input.images.count * config.imageSequenceLength
Expand Down