Fix media downsampling

DePasqualeOrg · DePasqualeOrg · commit 102aa97a103a · 2025-03-06T08:50:24.000+01:00
diff --git a/Libraries/MLXVLM/MediaProcessing.swift b/Libraries/MLXVLM/MediaProcessing.swift
@@ -15,7 +15,7 @@ private let context = CIContext()
 /// var image: CIImage
 /// image = MediaProcessing.inSRGBToneCurveSpace(image)
 ///
-/// // apply user instructions
+/// // Apply user instructions
 /// image = MediaProcessing.apply(image, processing: processing)
 ///
 /// image = MediaProcessing.resampleBicubic(image, to: config.size.cgSize)
@@ -59,6 +59,10 @@ public enum MediaProcessing {
     }
 
     /// Resample the image using bicubic interpolation.
+    /// - Parameters:
+    ///   - image: The image to resample
+    ///   - size: The target size
+    /// - Returns: The resampled image
     static public func resampleBicubic(_ image: CIImage, to size: CGSize) -> CIImage {
         let filter = CIFilter.bicubicScaleTransform()
         let extent = image.extent.size
@@ -70,19 +74,13 @@ public enum MediaProcessing {
         let desiredAspectRatio = size.width / size.height
         filter.aspectRatio = Float(1 / inputAspectRatio * desiredAspectRatio)
 
-        // that image is now the aspect ratio of the target and the size
-        // of the shorter dimension
-        let scale: CGFloat
-        if extent.width < extent.height {
-            scale = size.width / extent.width
-        } else {
-            scale = size.height / extent.height
-        }
+        // `scale` is the vertical factor; `aspectRatio` (set above) adds the horizontal one
+        let scale = size.height / extent.height
         filter.scale = Float(scale)
 
         let rescaled = filter.outputImage!
 
-        // the image has a DoD larger than the requested size so crop
+        // The scaled image's extent (DoD) may not match the requested size exactly, so crop
         // it to the desired size
         return rescaled.cropped(to: CGRect(origin: .zero, size: size))
     }
@@ -94,7 +92,7 @@ public enum MediaProcessing {
         let filter = CIFilter.colorMatrix()
         filter.inputImage = image
 
-        // this should match
+        // This should match
         // https://pytorch.org/vision/main/generated/torchvision.transforms.Normalize.html
         //
         // output[channel] = (input[channel] - mean[channel]) / std[channel]
@@ -113,6 +111,10 @@ public enum MediaProcessing {
     }
 
     /// Convert the CIImage into a planar 3 channel MLXArray `[1, C, H, W]`
+    /// - Parameters:
+    ///   - image: The image to convert
+    ///   - colorSpace: Optional color space for rendering
+    /// - Returns: The MLXArray representation of the image
     static public func asMLXArray(_ image: CIImage, colorSpace: CGColorSpace? = nil) -> MLXArray {
         let size = image.extent.size
         let w = Int(size.width.rounded())
@@ -135,10 +137,10 @@ public enum MediaProcessing {
 
         var array = MLXArray(data, [h, w, 4], type: Float32.self)
 
-        // drop 4th channel
+        // Drop 4th channel
         array = array[0..., 0..., ..<3]
 
-        // convert to 1, C, H, W
+        // Convert to 1, C, H, W
         array = array.reshaped(1, h, w, 3).transposed(0, 3, 1, 2)
 
         return array
diff --git a/Libraries/MLXVLM/Models/QwenVL.swift b/Libraries/MLXVLM/Models/QwenVL.swift
@@ -123,20 +123,25 @@ public struct QwenVL {
     {
         if height < factor {
             throw VLMError.imageProcessingFailure(
-                "height: \(height) must be larger than factor: \(factor)")
+                "Height: \(height) must be larger than factor: \(factor)")
         }
         if width < factor {
             throw VLMError.imageProcessingFailure(
-                "width: \(width) must be larger than factor: \(factor)")
+                "Width: \(width) must be larger than factor: \(factor)")
         }
         if max(height, width) / min(height, width) > 200 {
             throw VLMError.imageProcessingFailure(
-                "absolute aspect ratio must be smaller than 200: \(width)x\(height)")
+                "Absolute aspect ratio must be smaller than 200: \(width) × \(height)")
         }
 
+        // Maximum allowed dimension for any single side to prevent buffer overflows
+        // This is important for portrait/landscape images with extreme aspect ratios
+        let maxDimension = 224
+
         var hBar = max(factor, Int(round(Float(height) / Float(factor))) * factor)
         var wBar = max(factor, Int(round(Float(width) / Float(factor))) * factor)
 
+        // Start by scaling based on total pixel count
         if hBar * wBar > maxPixels {
             let beta = sqrt(Float(height * width) / Float(maxPixels))
             hBar = Int(floor(Float(height) / beta / Float(factor))) * factor
@@ -146,6 +151,24 @@ public struct QwenVL {
             hBar = Int(ceil(Float(height) * beta / Float(factor))) * factor
             wBar = Int(ceil(Float(width) * beta / Float(factor))) * factor
         }
+
+        // Additionally clamp the longer side to maxDimension, scaling both sides
+        // by the same ratio to maintain aspect ratio.
+        let longestSide = max(hBar, wBar)
+        if longestSide > maxDimension {
+            let scale = Float(maxDimension) / Float(longestSide)
+            // Largest multiple of `factor` that fits in maxDimension (at least one factor)
+            let limit = max(factor, maxDimension / factor * factor)
+            // Snap one scaled side to a multiple of `factor` within factor...limit.
+            // The lower bound matters: without it, rounding collapses the short
+            // side of a very elongated image to 0.
+            func snapped(_ side: Int) -> Int {
+                let multiple = Int(round(Float(side) * scale / Float(factor))) * factor
+                return min(limit, max(factor, multiple))
+            }
+            (hBar, wBar) = (snapped(hBar), snapped(wBar))
+        }
+
         return (hBar, wBar)
     }