
Commit f1f67bf

Working on integrating videos
1 parent 408e7a8 commit f1f67bf

File tree (5 files changed, +94 −142 lines):

Applications/VLMEval/ContentView.swift
Libraries/MLXLMCommon/LanguageModel.swift
Libraries/MLXLMCommon/UserInput.swift
Libraries/MLXVLM/Models/Qwen2VL.swift
Package.swift


Applications/VLMEval/ContentView.swift

Lines changed: 21 additions & 5 deletions
@@ -383,17 +383,33 @@ class VLMEvaluator {
         MLXRandom.seed(UInt64(Date.timeIntervalSinceReferenceDate * 1000))
 
         let result = try await modelContainer.perform { context in
-            let videos: [UserInput.Video] = videoURL != nil ? [.url(videoURL!)] : []
+            let images: [UserInput.Image] =
+                if let image {
+                    [UserInput.Image.ciImage(image)]
+                } else {
+                    []
+                }
+            let videos: [UserInput.Video] =
+                if let videoURL {
+                    [.url(videoURL)]
+                } else {
+                    []
+                }
             var userInput = UserInput(
                 messages: [
                     [
                         "role": "user",
                         "content": [
-                            ["type": "text", "text": prompt],
-                            ["type": "image"],
-                        ],
+                            ["type": "text", "text": prompt]
+                        ]
+                            + images.map { _ in
+                                ["type": "image"]
+                            }
+                            + videos.map { _ in
+                                ["type": "video"]
+                            },
                     ]
-                ], images: [.ciImage(image)], videos: videos)
+                ], images: images, videos: videos)
             userInput.processing.resize = .init(width: 448, height: 448)
 
             let input = try await context.processor.prepare(input: userInput)
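Not part of the commit, but as a reading aid: a minimal sketch of the content payload the updated view builds when exactly one image and one video are attached (the prompt text is made up). The per-attachment entries are appended after the text entry by the two map calls above.

// Hypothetical shape of the resulting user message (one image, one video).
let content: [[String: String]] = [
    ["type": "text", "text": "Describe what happens in this clip."],
    ["type": "image"],  // one entry per element of `images`
    ["type": "video"],  // one entry per element of `videos`
]
let message: [String: Any] = ["role": "user", "content": content]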

Libraries/MLXLMCommon/LanguageModel.swift

Lines changed: 4 additions & 2 deletions
@@ -69,7 +69,9 @@ public struct LMInput {
     /// Representation of prepared input image(s).
     public struct ProcessedImage {
 
+        /// Concatenated pixels from one or more images
         public let pixels: MLXArray
+        /// Time, height, and width of the images
         public let frames: [THW]?
 
         public init(
@@ -85,13 +87,13 @@ public struct LMInput {
     public struct ProcessedVideo {
 
         public let pixels: MLXArray
-        public let videoGridThw: [THW]?
+        public let frames: [THW]?
 
         public init(
             pixels: MLXArray, videoGridThw: [THW]? = nil
         ) {
             self.pixels = pixels
-            self.videoGridThw = videoGridThw
+            self.frames = videoGridThw
         }
     }
 
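A note on the rename: only the stored property becomes frames; the initializer keeps its videoGridThw: label in this commit. A hedged sketch (the helper name and grid values are illustrative only):

// Illustrative only: wraps the unchanged `videoGridThw:` init label while
// exposing the renamed `frames` property.
func makeProcessedVideo(pixels: MLXArray, grid: [THW]) -> LMInput.ProcessedVideo {
    LMInput.ProcessedVideo(pixels: pixels, videoGridThw: grid)
}
// e.g. makeProcessedVideo(pixels: pixels, grid: [THW(4, 28, 28)]).frames?.first?.product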

Libraries/MLXLMCommon/UserInput.swift

Lines changed: 2 additions & 1 deletion
@@ -134,9 +134,10 @@ public struct UserInput: Sendable {
         self.videos = videos
     }
 
-    public init(messages: [Message], images: [Image] = [Image]()) {
+    public init(messages: [Message], images: [Image] = [Image](), videos: [Video] = [Video]()) {
         self.prompt = .messages(messages)
         self.images = images
+        self.videos = videos
     }
 
     public init(prompt: Prompt, images: [Image] = [Image](), processing: Processing = .init()) {
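Not in the diff: a hedged usage sketch of the extended initializer, mirroring the ContentView change above; the file URL is a placeholder.

// Sketch only; the URL is a placeholder, not from the commit.
let input = UserInput(
    messages: [
        [
            "role": "user",
            "content": [
                ["type": "text", "text": "Summarize this clip."],
                ["type": "video"],
            ],
        ]
    ],
    videos: [.url(URL(fileURLWithPath: "/path/to/clip.mp4"))])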

Libraries/MLXVLM/Models/Qwen2VL.swift

Lines changed: 66 additions & 133 deletions
@@ -694,147 +694,83 @@ public class Qwen2VLProcessor: UserInputProcessor {
         return (flattenedPatches, .init(gridT, gridH, gridW))
     }
 
-    public func prepare(prompt: UserInput.Prompt, imageTHW: [THW]?, videoTHW: [THW]?) -> String {
-        // the tokenizer does have a chat template and it expects messages
-        // like this:
-        //
-        // [{'role': 'user', 'content': [{'type': 'text', 'text': 'What are these?'},
-        //     {'type': 'image'}, {'type': 'image'}, {'type': 'image'}]}]
-        //
-        // The output of the prompt template is fed into
-        // image_processing_qwen2_vl.preprocess where it is further augmented
-        // by replacing tokens according to imageTHW.
-        //
-        // Neither the structured content nor the postprocessing of the template
-        // are supported in current Tokenizer/Jinja (swift) so handle that here.
-
-        var messages = prompt.asMessages()
-        if messages[0]["role"] != "system" {
-            messages.insert(["role": "system", "content": "You are a helpful assistant."], at: 0)
+    public func prepare(input: UserInput) async throws -> LMInput {
+        let messages = input.prompt.asMessages()
+        var promptTokens = try tokenizer.applyChatTemplate(messages: messages)
+        // Text-only input
+        if input.images.isEmpty, input.videos.isEmpty {
+            return LMInput(tokens: MLXArray(promptTokens))
         }
-
-        let lastIndex = messages.count - 1
-        var lastMessage = messages[lastIndex]["content"] ?? ""
-
-        // image_processing_qwen2_vl.preprocess -- inject image_pad tokens for each image
-        let mergeLength = config.mergeSize * config.mergeSize
-        for thw in imageTHW ?? [] {
-            lastMessage += "<|vision_start|>"
-            lastMessage += Array(repeating: "<|image_pad|>", count: thw.product / mergeLength)
-                .joined()
-            lastMessage += "<|vision_end|>"
+        // Input with images and/or videos
+        // Image processing
+        let imagePixelsAndFrames = try input.images.map {
+            try preprocess(images: [$0.asCIImage()], processing: input.processing)
         }
-
-        for thw in videoTHW ?? [] {
-            lastMessage += "<|vision_start|>"
-            lastMessage += Array(repeating: "<|video_pad|>", count: thw.product / mergeLength)
-                .joined()
-            lastMessage += "<|vision_end|>"
+        let processedImage: LMInput.ProcessedImage?
+        if !imagePixelsAndFrames.isEmpty {
+            let imagePixelsConcatenated = concatenated(imagePixelsAndFrames.map { $0.0 })
+            processedImage = LMInput.ProcessedImage(
+                pixels: imagePixelsConcatenated, frames: imagePixelsAndFrames.map { $0.1 })
+            if let imageFrames = processedImage?.frames {
+                // Replace padding for images
+                promptTokens = try replacePlaceholderTokens(
+                    in: promptTokens, frames: imageFrames, paddingToken: "<|image_pad|>")
+            }
+        } else {
+            processedImage = nil
         }
-
-        messages[lastIndex]["content"] = lastMessage
-
-        return
-            messages
-            .map {
-                "<|im_start|>\($0["role"] ?? "user")\n\($0["content"] ?? "")<|im_end|>"
+        // Video processing
+        var videosAsImageSequences = [[CIImage]]()
+        for video in input.videos {
+            if let imageSequence = try? await MediaProcessing.asCIImageSequence(
+                video.asAVAsset(), samplesPerSecond: 2)
+            {
+                videosAsImageSequences.append(imageSequence)
             }
-            .joined(separator: "\n")
-            + "\n<|im_start|>assistant\n"
-    }
-
-    private func prepareMessages(_ messages: [Message]) -> [Message] {
-        var messages = messages
-        // Add system message if not present
-        if let role = messages[0]["role"] as? String, role != "system" {
-            messages.insert(["role": "system", "content": "You are a helpful assistant."], at: 0)
         }
-        return messages
-    }
-
-    // public func prepare(prompt: UserInput.Prompt, frames: [THW]?) throws -> String {
-    //     let messages = prepareMessages(prompt.asMessages())
-    //     let tokens = try tokenizer.applyChatTemplate(messages: messages)
-    //     return tokenizer.decode(tokens: tokens)
-    // }
-
-    public func prepare(input: UserInput) throws -> LMInput {
-        // Text-only input
-        if input.images.isEmpty {
-            let messages = input.prompt.asMessages()
-            let promptTokens = try tokenizer.applyChatTemplate(messages: messages)
-            return LMInput(tokens: MLXArray(promptTokens))
+        let videoPixelsAndFrames = try videosAsImageSequences.map {
+            try preprocess(images: $0, processing: input.processing)
         }
-        // Input with images
-        let pixelsAndFrames = try input.images.map {
-            try preprocess(images: [$0.asCIImage()], processing: input.processing)
+        let processedVideo: LMInput.ProcessedVideo?
+        if !videoPixelsAndFrames.isEmpty {
+            let videoPixelsConcatenated = concatenated(videoPixelsAndFrames.map { $0.0 })
+            processedVideo = LMInput.ProcessedVideo(
+                pixels: videoPixelsConcatenated, videoGridThw: videoPixelsAndFrames.map { $0.1 })
+            if let videoFrames = processedVideo?.frames {
+                promptTokens = try replacePlaceholderTokens(
+                    in: promptTokens, frames: videoFrames, paddingToken: "<|video_pad|>")
+            }
+        } else {
+            processedVideo = nil
         }
+        //
+        let promptArray = MLXArray(promptTokens).expandedDimensions(axis: 0)
+        let mask = ones(like: promptArray).asType(.int8)
+        return LMInput(
+            text: .init(tokens: promptArray, mask: mask), image: processedImage,
+            video: processedVideo)
+    }
 
-        // var videosAsImageSequences = [[CIImage]]()
-        // for video in input.videos {
-        //     if let imageSequence = try? await MediaProcessing.asCIImageSequence(
-        //         video.asAVAsset(), samplesPerSecond: 2)
-        //     {
-        //         videosAsImageSequences.append(imageSequence)
-        //     }
-        // }
-        // let videos = try videosAsImageSequences.map {
-        //     try preprocess(images: $0, processing: input.processing)
-        // }
-
-        // let imagePixels: MLXArray?
-        // let image: LMInput.ProcessedImage?
-        // if !images.isEmpty {
-        //     imagePixels = concatenated(images.map { $0.0 })
-        //     image = LMInput.ProcessedImage(pixels: imagePixels!, imageGridThw: images.map { $0.1 })
-        // } else {
-        //     imagePixels = nil
-        //     image = nil
-        // }
-
-        // let videoPixels: MLXArray?
-        // let video: LMInput.ProcessedVideo?
-        // if !videos.isEmpty {
-        //     videoPixels = concatenated(videos.map { $0.0 })
-        //     video = LMInput.ProcessedVideo(pixels: videoPixels!, videoGridThw: videos.map { $0.1 })
-        // } else {
-        //     videoPixels = nil
-        //     video = nil
-        // }
-
-        // // processing_qwen2_vl.Qwen2VLProcessor
-        // let prompt = prepare(
-        //     prompt: input.prompt, imageTHW: image?.imageGridThw, videoTHW: video?.videoGridThw)
-        // let promptTokens = try tokenizer.encode(text: prompt)
-        // let promptArray = MLXArray(promptTokens).expandedDimensions(axis: 0)
-        // let mask = ones(like: promptArray).asType(.int8)
-
-        // return LMInput(text: .init(tokens: promptArray, mask: mask), image: image, video: video)
-        let pixelsConcatenated = concatenated(pixelsAndFrames.map { $0.0 })
-        let image = LMInput.ProcessedImage(
-            pixels: pixelsConcatenated, frames: pixelsAndFrames.map { $0.1 })
-        let messages = prepareMessages(input.prompt.asMessages())
-        var promptTokens = try tokenizer.applyChatTemplate(messages: messages)
-        // Replace single image pad token with correct number for each image
-        let mergeLength = config.mergeSize * config.mergeSize
-        let imagePlaceholderTokens = try tokenizer.encode(
-            text: "<|vision_start|><|image_pad|><|vision_end|>")
-        guard let frames = image.frames else {
-            throw Qwen2VLProcessorError.framesIsNil
-        }
-        let placeholderRanges = promptTokens.ranges(of: imagePlaceholderTokens)
+    func replacePlaceholderTokens(in promptTokens: [Int], frames: [THW], paddingToken: String)
+        throws -> [Int]
+    {
+        // Replace single padding token with correct number for each image
+        let placeholderTokens = try tokenizer.encode(
+            text: "<|vision_start|>\(paddingToken)<|vision_end|>")
+        let placeholderRanges = promptTokens.ranges(of: placeholderTokens)
         guard placeholderRanges.count == frames.count else {
             throw VLMError.processing(
-                "Number of image placeholders does not match number of frames")
+                "Number of placeholder tokens does not match number of frames")
         }
-        let replacementSequences = try frames.map { thw in
-            let paddingCount = thw.product / mergeLength
+        let mergeLength = config.mergeSize * config.mergeSize
+        let replacementSequences = try frames.map { frame in
+            let paddingCount = frame.product / mergeLength
             return try tokenizer.encode(
                 text:
-                    "<|vision_start|>\(Array(repeating: "<|image_pad|>", count: paddingCount).joined())<|vision_end|>"
+                    "<|vision_start|>\(Array(repeating: paddingToken, count: paddingCount).joined())<|vision_end|>"
             )
         }
-        // Build the final array
+        // Build the final array (images)
         var result: [Int] = []
         var currentIndex = promptTokens.startIndex
         for (range, replacement) in zip(placeholderRanges, replacementSequences) {
@@ -848,10 +784,7 @@ public class Qwen2VLProcessor: UserInputProcessor {
         if currentIndex < promptTokens.endIndex {
             result.append(contentsOf: promptTokens[currentIndex...])
         }
-        promptTokens = result
-        let promptArray = MLXArray(promptTokens).expandedDimensions(axis: 0)
-        let mask = ones(like: promptArray).asType(.int8)
-        return LMInput(text: .init(tokens: promptArray, mask: mask), image: image)
+        return result
     }
 }
 
@@ -934,17 +867,17 @@ public class Qwen2VL: Module, VLMModel, KVCacheDimensionProvider {
 
         let dtype = visionModel.patchEmbed.proj.weight.dtype
 
-        let imageGridThw = input.image?.imageGridThw
+        let imageFrames = input.image?.frames
         let imagePixels = input.image?.pixels.asType(dtype)
 
-        let videoGridThw = input.video?.videoGridThw
+        let videoGridThw = input.video?.frames
        let videoPixels = input.video?.pixels.asType(dtype)
 
         let gridThw: [THW]?
         let pixels: MLXArray?
 
         if videoGridThw == nil {
-            gridThw = imageGridThw
+            gridThw = imageFrames
             pixels = imagePixels
         } else {
             gridThw = videoGridThw
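Not part of the commit: a worked example of the padding arithmetic in replacePlaceholderTokens, assuming mergeSize == 2 (so mergeLength == 4); the grid values are made up.

// Assumed values for illustration: 8 temporal patches on a 28 x 28 patch grid.
let frame = THW(8, 28, 28)
let mergeLength = 2 * 2                          // config.mergeSize * config.mergeSize, assuming mergeSize == 2
let paddingCount = frame.product / mergeLength   // 8 * 28 * 28 / 4 = 1568
// The single "<|vision_start|><|video_pad|><|vision_end|>" placeholder emitted by the
// chat template is replaced by 1568 "<|video_pad|>" tokens between the same markers.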

Package.swift

Lines changed: 1 addition & 1 deletion
@@ -29,7 +29,7 @@ let package = Package(
     dependencies: [
         .package(url: "https://github.com/ml-explore/mlx-swift", .upToNextMinor(from: "0.21.2")),
         .package(
-            url: "https://github.com/huggingface/swift-transformers", .upToNextMinor(from: "0.1.16")
+            url: "https://github.com/huggingface/swift-transformers", .upToNextMinor(from: "0.1.17")
         ),
         .package(
             url: "https://github.com/apple/swift-async-algorithms", .upToNextMinor(from: "1.0.0")),
