Fix and factor out checkArrayShape

DePasqualeOrg · DePasqualeOrg · commit 82df136c7221 · 2025-06-28T12:24:03.000+02:00
diff --git a/Libraries/MLXVLM/CheckArrayShape.swift b/Libraries/MLXVLM/CheckArrayShape.swift
@@ -0,0 +1,24 @@
+import MLX
+
+/// Heuristically reports whether `arr` already has a supported conv weight layout.
+///
+/// Only the shape is inspected, so the result is a guess rather than a guarantee.
+/// Arrays that are neither 3-D nor 4-D always yield `false`.
+public func checkArrayShape(_ arr: MLXArray) -> Bool {
+    let dims = arr.shape
+    if dims.count == 4 {
+        // Conv2d: dims are read as (out_channels, kH, kW, in_channels); the last is unused.
+        let (outChannels, kernelHeight, kernelWidth) = (dims[0], dims[1], dims[2])
+        // Accept a square kernel whose side does not exceed out_channels.
+        return kernelHeight == kernelWidth && outChannels >= kernelHeight
+            && outChannels >= kernelWidth
+    }
+    if dims.count == 3 {
+        // Conv1d: dims[0] is unused; dims[1] is read as kW and dims[2] as a channel count.
+        let (kernelWidth, trailing) = (dims[1], dims[2])
+        // Accept when the kernel width is at least the trailing dimension.
+        return kernelWidth >= trailing
+    }
+    // Any other number of dimensions is not a supported conv weight.
+    return false
+}
diff --git a/Libraries/MLXVLM/Models/Gemma3.swift b/Libraries/MLXVLM/Models/Gemma3.swift
@@ -756,19 +756,6 @@ private class VisionModel: Module {
         visionModel(x, outputHiddenStates: outputHiddenStates)
     }
 
-    /// Check if array is already in MLX format for conv2d weights
-    private func checkArrayShape(_ arr: MLXArray) -> Bool {
-        let shape = arr.shape
-
-        // Check if the shape has 4 dimensions
-        guard shape.count == 4 else { return false }
-
-        let (outChannels, kH, kW, _) = (shape[0], shape[1], shape[2], shape[3])
-
-        // Check if out_channels is the largest, and kH and kW are the same
-        return (outChannels >= kH) && (outChannels >= kW) && (kH == kW)
-    }
-
     func sanitize(weights: [String: MLXArray]) -> [String: MLXArray] {
         var sanitizedWeights = [String: MLXArray]()
 
diff --git a/Libraries/MLXVLM/Models/Gemma3n.swift b/Libraries/MLXVLM/Models/Gemma3n.swift
@@ -1652,20 +1652,6 @@ private func maskedScatter(
     return resultFlat.reshaped(inputShape)
 }
 
-private func checkArrayShape(_ arr: MLXArray) -> Bool {
-    let shape = arr.shape
-    guard shape.count == 4 else {
-        return false
-    }
-
-    let (outChannels, kH, kW, _) = (shape[0], shape[1], shape[2], shape[3])
-    let result = (outChannels >= kH) && (outChannels >= kW) && (kH == kW)
-    print(
-        "🔍 checkArrayShape: shape=\(shape), outChannels=\(outChannels), kH=\(kH), kW=\(kW), result=\(result)"
-    )
-    return result
-}
-
 // MARK: - Main Model
 
 public class Gemma3n: Module, VLMModel, KVCacheDimensionProvider {
@@ -3925,27 +3911,31 @@ private class Gemma3nAudioModel: Module {
         return (audioencodings, currentMask)
     }
 
+    /// Sanitizes weights by transposing convolution layers if they are not
+    /// already in the expected MLX format.
     func sanitize(weights: [String: MLXArray]) -> [String: MLXArray] {
         var sanitizedWeights = [String: MLXArray]()
 
         for (k, v) in weights {
             if k.contains("conv.weight") {
-                // The checkArrayShape function is not robust.
-                // The Python implementation doesn't use it. It's safer to just transpose.
-                // Assuming NCHW -> NHWC for Conv2d
-                if v.ndim == 4 {
+                // A Conv2D weight should be 4D.
+                // If it is, check if it needs transposing from NCHW to NHWC.
+                // If checkArrayShape is true, it's already in the correct format.
+                if v.ndim == 4 && !checkArrayShape(v) {
                     sanitizedWeights[k] = v.transposed(0, 2, 3, 1)
                 } else {
                     sanitizedWeights[k] = v
                 }
             } else if k.contains("conv1d.weight") {
-                // Assuming NCL -> NLC for Conv1d
-                if v.ndim == 3 {
+                // A Conv1D weight should be 3D.
+                // If it is, check if it needs transposing from NCL to NLC.
+                if v.ndim == 3 && !checkArrayShape(v) {
                     sanitizedWeights[k] = v.transposed(0, 2, 1)
                 } else {
                     sanitizedWeights[k] = v
                 }
             } else {
+                // For all other weights, keep them as they are.
                 sanitizedWeights[k] = v
             }
         }
@@ -4149,7 +4139,6 @@ public struct Gemma3nProcessorConfiguration: Codable, Sendable {
     public let doConvertRgb: Bool?
     public let doPanAndScan: Bool?
 
-    // Token identifiers - use default values that match Python implementation
     public var imageTokenId: Int { 262145 }
     public var audioTokenId: Int { 262273 }
 
diff --git a/Libraries/MLXVLM/Models/Idefics3.swift b/Libraries/MLXVLM/Models/Idefics3.swift
@@ -386,12 +386,6 @@ private enum Language {
 // MARK: - Vision
 
 private enum Vision {
-    static func checkArrayShape(_ arr: MLXArray) -> Bool {
-        if arr.ndim != 4 { return false }
-        let (o, h, w, _) = (arr.dim(0), arr.dim(1), arr.dim(2), arr.dim(3))
-        return (o >= h && o >= w && h == w)
-    }
-
     fileprivate class Attention: Module {
         let numHeads: Int
         let scale: Float
@@ -602,7 +596,7 @@ private enum Vision {
                 if k.contains("position_ids") {
                     continue
                 } else if k.contains("patch_embedding.weight") {
-                    if Vision.checkArrayShape(v) {
+                    if checkArrayShape(v) {
                         sanitizedWeights[k] = v
                     } else {
                         sanitizedWeights[k] = v.transposed(0, 2, 3, 1)