Use chat template for Qwen 2 VL

DePasqualeOrg · DePasqualeOrg · commit 408e7a8813c7 · 2025-02-05T06:05:17.000Z
diff --git a/Applications/VLMEval/ContentView.swift b/Applications/VLMEval/ContentView.swift
@@ -383,9 +383,14 @@ class VLMEvaluator {
             MLXRandom.seed(UInt64(Date.timeIntervalSinceReferenceDate * 1000))
 
             let result = try await modelContainer.perform { context in
                 let images: [UserInput.Image] = image != nil ? [.ciImage(image!)] : []
                 let videos: [UserInput.Video] = videoURL != nil ? [.url(videoURL!)] : []
-                var userInput = UserInput(prompt: prompt, images: images, videos: videos)
+                // One "image" content entry per attached image; none when there is no image.
+                var content: [[String: String]] = [["type": "text", "text": prompt]]
+                content += images.map { _ in ["type": "image"] }
+                var userInput = UserInput(
+                    messages: [["role": "user", "content": content]],
+                    images: images, videos: videos)
                 userInput.processing.resize = .init(width: 448, height: 448)
 
                 let input = try await context.processor.prepare(input: userInput)
diff --git a/Libraries/MLXLLM/LLMModelFactory.swift b/Libraries/MLXLLM/LLMModelFactory.swift
@@ -236,7 +236,7 @@ private struct LLMUserInputProcessor: UserInputProcessor {
             // but that is not public so just fall back to text
             let prompt = input.prompt
                 .asMessages()
-                .compactMap { $0["content"] }
+                .compactMap { $0["content"] as? String }
                 .joined(separator: ". ")
             let promptTokens = tokenizer.encode(text: prompt)
             return LMInput(tokens: MLXArray(promptTokens))
diff --git a/Libraries/MLXLMCommon/LanguageModel.swift b/Libraries/MLXLMCommon/LanguageModel.swift
@@ -70,13 +70,13 @@ public struct LMInput {
     public struct ProcessedImage {
 
         public let pixels: MLXArray
-        public let imageGridThw: [THW]?
+        public let frames: [THW]?
 
         public init(
-            pixels: MLXArray, imageGridThw: [THW]? = nil
+            pixels: MLXArray, frames: [THW]? = nil
         ) {
             self.pixels = pixels
-            self.imageGridThw = imageGridThw
+            self.frames = frames
         }
     }
 
diff --git a/Libraries/MLXLMCommon/UserInput.swift b/Libraries/MLXLMCommon/UserInput.swift
@@ -5,18 +5,19 @@ import CoreImage
 import Foundation
 import MLX
 
+public typealias Message = [String: Any]
+
 /// Container for raw user input.
 ///
 /// A ``UserInputProcessor`` can convert this to ``LMInput``.
 /// See also ``ModelContext``.
 public struct UserInput: Sendable {
-
     /// Representation of a prompt or series of messages (conversation).
     public enum Prompt: Sendable, CustomStringConvertible {
         case text(String)
-        case messages([[String: String]])
+        case messages([Message])
 
-        public func asMessages() -> [[String: String]] {
+        public func asMessages() -> [Message] {
             switch self {
             case .text(let text):
                 return [["role": "user", "content": text]]
@@ -133,7 +134,16 @@ public struct UserInput: Sendable {
         self.videos = videos
     }
 
-    public init(messages: [[String: String]], images: [Image] = [Image]()) {
+    /// Creates user input from a chat-style list of messages.
+    ///
+    /// - Parameters:
+    ///   - messages: The conversation, stored as `.messages(messages)`.
+    ///   - images: Images that accompany the conversation. Defaults to none.
+    ///   - videos: Videos that accompany the conversation. Defaults to none.
+    public init(
+        messages: [Message], images: [Image] = [Image](), videos: [Video] = [Video]()
+    ) {
         self.prompt = .messages(messages)
         self.images = images
+        self.videos = videos
     }
diff --git a/Libraries/MLXVLM/Models/Idefics3.swift b/Libraries/MLXVLM/Models/Idefics3.swift
@@ -805,7 +805,7 @@ public class Idefics3Processor: UserInputProcessor {
     }
 
     public func prepare(input: UserInput) throws -> LMInput {
-        let prompt = input.prompt.asMessages().last?["content"] ?? ""
+        let prompt = input.prompt.asMessages().last?["content"] as? String ?? ""
 
         if input.images.isEmpty {
             // No image scenario
diff --git a/Libraries/MLXVLM/Models/Paligemma.swift b/Libraries/MLXVLM/Models/Paligemma.swift
@@ -478,7 +478,7 @@ public class PaligGemmaProcessor: UserInputProcessor {
         }
 
         // this doesn't have a chat template so just use the last message.
-        var prompt = input.prompt.asMessages().last?["content"] ?? ""
+        var prompt = input.prompt.asMessages().last?["content"] as? String ?? ""
 
         // based on transformers/processing_paligemma
         let count = input.images.count * config.imageSequenceLength
diff --git a/Libraries/MLXVLM/Models/Qwen2VL.swift b/Libraries/MLXVLM/Models/Qwen2VL.swift
@@ -367,10 +367,10 @@ private enum Vision {
         }
 
         public func callAsFunction(
-            _ x: MLXArray, gridThw: [THW], rotaryPositionEmbedding: MLXArray
+            _ x: MLXArray, frames: [THW], rotaryPositionEmbedding: MLXArray
         ) -> MLXArray {
             let sequenceLength = x.dim(0)
-            let B = gridThw[0].t
+            let B = frames[0].t
             let L = sequenceLength / B
 
             let qkv = qkv(x)
@@ -435,13 +435,13 @@ private enum Vision {
         }
 
         func callAsFunction(
-            _ hiddenStates: MLXArray, gridThw: [THW], rotaryPositionEmbedding: MLXArray
+            _ hiddenStates: MLXArray, frames: [THW], rotaryPositionEmbedding: MLXArray
         ) -> MLXArray {
             var hiddenStates =
                 hiddenStates
                 + attention(
                     norm1(hiddenStates),
-                    gridThw: gridThw,
+                    frames: frames,
                     rotaryPositionEmbedding: rotaryPositionEmbedding
                 )
             hiddenStates = hiddenStates + mlp(norm2(hiddenStates))
@@ -479,10 +479,10 @@ private enum Vision {
                 spatialMergeSize: 2)
         }
 
-        func rotaryPositionEmbedding(_ gridThw: [THW]) -> MLXArray {
+        func rotaryPositionEmbedding(_ frames: [THW]) -> MLXArray {
             var positionIds = [MLXArray]()
 
-            for row in gridThw {
+            for row in frames {
                 let (t, h, w) = row.values
 
                 var hposIds = expandedDimensions(MLXArray(0 ..< h), axis: 1)
@@ -516,22 +516,22 @@ private enum Vision {
             }
 
             let indices = concatenated(positionIds, axis: 0)
-            let maxGridSize = gridThw.lazy.map { max($0.h, $0.w) }.max() ?? 0
-            let rotaryPositionEmbedFull = rotaryPositionEmbedding(sequenceLength: maxGridSize)[
+            let maxFrameSize = frames.lazy.map { max($0.h, $0.w) }.max() ?? 0
+            let rotaryPositionEmbedFull = rotaryPositionEmbedding(sequenceLength: maxFrameSize)[
                 indices]
 
             return rotaryPositionEmbedFull.reshaped(indices.dim(0), -1)
         }
 
-        public func callAsFunction(_ hiddenStates: MLXArray, gridThw: [THW]) -> MLXArray {
+        public func callAsFunction(_ hiddenStates: MLXArray, frames: [THW]) -> MLXArray {
             var hiddenStates = patchEmbed(hiddenStates)
-            let rotaryPositionEmbedding = rotaryPositionEmbedding(gridThw)
+            let rotaryPositionEmbedding = rotaryPositionEmbedding(frames)
 
-            let batchSize = gridThw.count
+            let batchSize = frames.count
 
             for block in blocks {
                 hiddenStates = block(
-                    hiddenStates, gridThw: gridThw,
+                    hiddenStates, frames: frames,
                     rotaryPositionEmbedding: rotaryPositionEmbedding)
             }
 
@@ -585,6 +585,10 @@ private enum Vision {
 /// This is meant to be used with ``Qwen2VL`` and is typically created by ``VLMModelFactory``.
 public class Qwen2VLProcessor: UserInputProcessor {
 
+    enum Qwen2VLProcessorError: Error {
+        case framesIsNil
+    }
+
     private let config: Qwen2VLProcessorConfiguration
     private let tokenizer: any Tokenizer
 
@@ -739,61 +743,116 @@ public class Qwen2VLProcessor: UserInputProcessor {
             + "\n<|im_start|>assistant\n"
     }
 
-    public func prepare(input: UserInput) async throws -> LMInput {
-        if input.images.isEmpty && input.videos.isEmpty {
-            // just a straight text prompt
-            let prompt = prepare(prompt: input.prompt, imageTHW: nil, videoTHW: nil)
-            let promptTokens = try tokenizer.encode(text: prompt)
-            return LMInput(tokens: MLXArray(promptTokens))
+    private func prepareMessages(_ messages: [Message]) -> [Message] {
+        var messages = messages
+        // Prepend a default system message if absent; an empty conversation is returned unchanged
+        if let role = messages.first?["role"] as? String, role != "system" {
+            messages.insert(["role": "system", "content": "You are a helpful assistant."], at: 0)
         }
+        return messages
+    }
 
-        // image_processing_qwen2_vl.preprocess
-        let images = try input.images.map {
+    //    public func prepare(prompt: UserInput.Prompt, frames: [THW]?) throws -> String {
+    //        let messages = prepareMessages(prompt.asMessages())
+    //        let tokens = try tokenizer.applyChatTemplate(messages: messages)
+    //        return tokenizer.decode(tokens: tokens)
+    //    }
+
+    public func prepare(input: UserInput) throws -> LMInput {
+        // Text-only input
+        if input.images.isEmpty {
+            let messages = input.prompt.asMessages()
+            let promptTokens = try tokenizer.applyChatTemplate(messages: messages)
+            return LMInput(tokens: MLXArray(promptTokens))
+        }
+        // Input with images
+        let pixelsAndFrames = try input.images.map {
             try preprocess(images: [$0.asCIImage()], processing: input.processing)
         }
 
-        var videosAsImageSequences = [[CIImage]]()
-        for video in input.videos {
-            if let imageSequence = try? await MediaProcessing.asCIImageSequence(
-                video.asAVAsset(), samplesPerSecond: 2)
-            {
-                videosAsImageSequences.append(imageSequence)
-            }
+        // var videosAsImageSequences = [[CIImage]]()
+        // for video in input.videos {
+        //     if let imageSequence = try? await MediaProcessing.asCIImageSequence(
+        //         video.asAVAsset(), samplesPerSecond: 2)
+        //     {
+        //         videosAsImageSequences.append(imageSequence)
+        //     }
+        // }
+        // let videos = try videosAsImageSequences.map {
+        //     try preprocess(images: $0, processing: input.processing)
+        // }
+
+        // let imagePixels: MLXArray?
+        // let image: LMInput.ProcessedImage?
+        // if !images.isEmpty {
+        //     imagePixels = concatenated(images.map { $0.0 })
+        //     image = LMInput.ProcessedImage(pixels: imagePixels!, imageGridThw: images.map { $0.1 })
+        // } else {
+        //     imagePixels = nil
+        //     image = nil
+        // }
+
+        // let videoPixels: MLXArray?
+        // let video: LMInput.ProcessedVideo?
+        // if !videos.isEmpty {
+        //     videoPixels = concatenated(videos.map { $0.0 })
+        //     video = LMInput.ProcessedVideo(pixels: videoPixels!, videoGridThw: videos.map { $0.1 })
+        // } else {
+        //     videoPixels = nil
+        //     video = nil
+        // }
+
+        // // processing_qwen2_vl.Qwen2VLProcessor
+        // let prompt = prepare(
+        //     prompt: input.prompt, imageTHW: image?.imageGridThw, videoTHW: video?.videoGridThw)
+        // let promptTokens = try tokenizer.encode(text: prompt)
+        // let promptArray = MLXArray(promptTokens).expandedDimensions(axis: 0)
+        // let mask = ones(like: promptArray).asType(.int8)
+
+        // return LMInput(text: .init(tokens: promptArray, mask: mask), image: image, video: video)
+        let pixelsConcatenated = concatenated(pixelsAndFrames.map { $0.0 })
+        let image = LMInput.ProcessedImage(
+            pixels: pixelsConcatenated, frames: pixelsAndFrames.map { $0.1 })
+        let messages = prepareMessages(input.prompt.asMessages())
+        var promptTokens = try tokenizer.applyChatTemplate(messages: messages)
+        // Replace single image pad token with correct number for each image
+        let mergeLength = config.mergeSize * config.mergeSize
+        let imagePlaceholderTokens = try tokenizer.encode(
+            text: "<|vision_start|><|image_pad|><|vision_end|>")
+        guard let frames = image.frames else {
+            throw Qwen2VLProcessorError.framesIsNil
         }
-        let videos = try videosAsImageSequences.map {
-            try preprocess(images: $0, processing: input.processing)
+        let placeholderRanges = promptTokens.ranges(of: imagePlaceholderTokens)
+        guard placeholderRanges.count == frames.count else {
+            throw VLMError.processing(
+                "Number of image placeholders does not match number of frames")
         }
-
-        let imagePixels: MLXArray?
-        let image: LMInput.ProcessedImage?
-        if !images.isEmpty {
-            imagePixels = concatenated(images.map { $0.0 })
-            image = LMInput.ProcessedImage(pixels: imagePixels!, imageGridThw: images.map { $0.1 })
-        } else {
-            imagePixels = nil
-            image = nil
+        let replacementSequences = try frames.map { thw in
+            let paddingCount = thw.product / mergeLength
+            return try tokenizer.encode(
+                text:
+                    "<|vision_start|>\(Array(repeating: "<|image_pad|>", count: paddingCount).joined())<|vision_end|>"
+            )
         }
-
-        let videoPixels: MLXArray?
-        let video: LMInput.ProcessedVideo?
-        if !videos.isEmpty {
-            videoPixels = concatenated(videos.map { $0.0 })
-            video = LMInput.ProcessedVideo(pixels: videoPixels!, videoGridThw: videos.map { $0.1 })
-        } else {
-            videoPixels = nil
-            video = nil
+        // Build the final array
+        var result: [Int] = []
+        var currentIndex = promptTokens.startIndex
+        for (range, replacement) in zip(placeholderRanges, replacementSequences) {
+            // Add tokens before the placeholder
+            result.append(contentsOf: promptTokens[currentIndex ..< range.lowerBound])
+            // Add replacement sequence
+            result.append(contentsOf: replacement)
+            currentIndex = range.upperBound
         }
-
-        // processing_qwen2_vl.Qwen2VLProcessor
-        let prompt = prepare(
-            prompt: input.prompt, imageTHW: image?.imageGridThw, videoTHW: video?.videoGridThw)
-        let promptTokens = try tokenizer.encode(text: prompt)
+        // Add any remaining tokens after the last replacement
+        if currentIndex < promptTokens.endIndex {
+            result.append(contentsOf: promptTokens[currentIndex...])
+        }
+        promptTokens = result
         let promptArray = MLXArray(promptTokens).expandedDimensions(axis: 0)
         let mask = ones(like: promptArray).asType(.int8)
-
-        return LMInput(text: .init(tokens: promptArray, mask: mask), image: image, video: video)
+        return LMInput(text: .init(tokens: promptArray, mask: mask), image: image)
     }
-
 }
 
 // MARK: - Model
@@ -821,18 +880,18 @@ public class Qwen2VL: Module, VLMModel, KVCacheDimensionProvider {
         self._languageModel.wrappedValue = Language.LanguageModel(config.textConfiguration)
     }
 
-    private func inputEmbeddings(inputIds: MLXArray, pixelValues: MLXArray?, gridThw: [THW]?)
+    private func inputEmbeddings(inputIds: MLXArray, pixelValues: MLXArray?, frames: [THW]?)
         -> MLXArray
     {
-        guard let pixelValues, let gridThw else {
+        guard let pixelValues, let frames else {
             return languageModel.model.embedTokens(inputIds[.newAxis, .ellipsis])
         }
 
         // Get the input embeddings from the language model
         let inputEmbeds = languageModel.model.embedTokens(inputIds)
 
         // Get the ouptut hidden states from the vision model
-        var hiddenStates = self.visionModel(pixelValues, gridThw: gridThw)
+        var hiddenStates = self.visionModel(pixelValues, frames: frames)
 
         if hiddenStates.ndim == 2 {
             hiddenStates = hiddenStates[.newAxis, 0..., 0...]
@@ -871,6 +930,8 @@ public class Qwen2VL: Module, VLMModel, KVCacheDimensionProvider {
     public func prepare(_ input: LMInput, cache: [any KVCache], windowSize: Int?) throws
         -> PrepareResult
     {
+        let frames = input.image?.frames
+
         let dtype = visionModel.patchEmbed.proj.weight.dtype
 
         let imageGridThw = input.image?.imageGridThw
@@ -891,7 +952,7 @@ public class Qwen2VL: Module, VLMModel, KVCacheDimensionProvider {
         }
 
         let inputEmbeddings = self.inputEmbeddings(
-            inputIds: input.text.tokens, pixelValues: pixels, gridThw: gridThw)
+            inputIds: input.text.tokens, pixelValues: pixels, frames: frames)
 
         let result = languageModel(nil, cache: cache, inputEmbedding: inputEmbeddings)
 
diff --git a/Libraries/MLXVLM/VLMModelFactory.swift b/Libraries/MLXVLM/VLMModelFactory.swift
@@ -11,6 +11,7 @@ public enum VLMError: Error {
     case maskRequired
     case singleImageAllowed
     case imageProcessingFailure(String)
+    case processing(String)
 }
 
 public struct BaseProcessorConfiguration: Codable, Sendable {

Original file line number	Diff line number	Diff line change
`@@ -70,13 +70,13 @@ public struct LMInput {`
`70`	`70`	`public struct ProcessedImage {`
`71`	`71`
`72`	`72`	`public let pixels: MLXArray`
`73`		`- public let imageGridThw: [THW]?`
	`73`	`+ public let frames: [THW]?`
`74`	`74`
`75`	`75`	`public init(`
`76`		`- pixels: MLXArray, imageGridThw: [THW]? = nil`
	`76`	`+ pixels: MLXArray, frames: [THW]? = nil`
`77`	`77`	`) {`
`78`	`78`	`self.pixels = pixels`
`79`		`- self.imageGridThw = imageGridThw`
	`79`	`+ self.frames = frames`
`80`	`80`	`}`
`81`	`81`	`}`
`82`	`82`
Original file line number	Diff line number	Diff line change
`@@ -805,7 +805,7 @@ public class Idefics3Processor: UserInputProcessor {`
`805`	`805`	`}`
`806`	`806`
`807`	`807`	`public func prepare(input: UserInput) throws -> LMInput {`
`808`		`- let prompt = input.prompt.asMessages().last?["content"] ?? ""`
	`808`	`+ let prompt = input.prompt.asMessages().last?["content"] as? String ?? ""`
`809`	`809`
`810`	`810`	`if input.images.isEmpty {`
`811`	`811`	`// No image scenario`
Original file line number	Diff line number	Diff line change
`@@ -478,7 +478,7 @@ public class PaligGemmaProcessor: UserInputProcessor {`
`478`	`478`	`}`
`479`	`479`
`480`	`480`	`// this doesn't have a chat template so just use the last message.`
`481`		`- var prompt = input.prompt.asMessages().last?["content"] ?? ""`
	`481`	`+ var prompt = input.prompt.asMessages().last?["content"] as? String ?? ""`
`482`	`482`
`483`	`483`	`// based on transformers/processing_paligemma`
`484`	`484`	`let count = input.images.count * config.imageSequenceLength`
Original file line number	Diff line number	Diff line change
`@@ -11,6 +11,7 @@ public enum VLMError: Error {`
`11`	`11`	`case maskRequired`
`12`	`12`	`case singleImageAllowed`
`13`	`13`	`case imageProcessingFailure(String)`
	`14`	`+ case processing(String)`
`14`	`15`	`}`
`15`	`16`
`16`	`17`	`public struct BaseProcessorConfiguration: Codable, Sendable {`