From 986bb5222f298ed8b1f65a29127ec36f4e3d06c0 Mon Sep 17 00:00:00 2001 From: Anthony DePasquale Date: Fri, 27 Jun 2025 00:55:43 +0200 Subject: [PATCH 01/19] Builds without errors --- Libraries/MLXVLM/Models/Gemma3n.swift | 4049 ++++++++++++++++++++++++ Libraries/MLXVLM/VLMModelFactory.swift | 17 + 2 files changed, 4066 insertions(+) create mode 100644 Libraries/MLXVLM/Models/Gemma3n.swift diff --git a/Libraries/MLXVLM/Models/Gemma3n.swift b/Libraries/MLXVLM/Models/Gemma3n.swift new file mode 100644 index 00000000..3ef06b1c --- /dev/null +++ b/Libraries/MLXVLM/Models/Gemma3n.swift @@ -0,0 +1,4049 @@ +// +// Gemma3n.swift +// mlx-swift-examples +// +// Created by Anthony DePasquale on 27.06.2025. +// + +import CoreImage +import MLX +import MLXFast +import MLXLMCommon +import MLXNN +import Tokenizers + +// MARK: - Configuration Classes + +// Protocol for multimodal configs that can be used with Gemma3nMultimodalEmbedder +public protocol MultimodalConfig { + var hiddenSize: Int { get } + var rmsNormEps: Float { get } + var vocabOffset: Int { get } + var vocabSize: Int { get } +} + +public struct AudioConfig: Codable, Sendable, MultimodalConfig { + public let inputFeatSize: Int + public let hiddenSize: Int + public let confAttentionChunkSize: Int + public let confAttentionContextLeft: Int + public let confAttentionContextRight: Int + public let confAttentionInvalidLogitsValue: Float + public let confAttentionLogitCap: Float + public let confNumAttentionHeads: Int + public let confNumHiddenLayers: Int + public let confConvKernelSize: Int + public let confPositionalBiasSize: Int + public let confReductionFactor: Int + public let confResidualWeight: Float + public let sscpConvChannelSize: [Int] + public let sscpConvGroupNormEps: Float + public let sscpConvKernelSize: [[Int]] + public let sscpConvStrideSize: [[Int]] + public let vocabSize: Int + public let sscpConvEps: Float + public let rmsNormEps: Float + public let gradientClipping: Float + public let vocabOffset: Int + + public init( + inputFeatSize: Int = 80, + hiddenSize: Int = 1536, + confAttentionChunkSize: Int = 12, + confAttentionContextLeft: Int = 13, + confAttentionContextRight: Int = 0, + confAttentionInvalidLogitsValue: Float = -1e9, + confAttentionLogitCap: Float = 50.0, + confNumAttentionHeads: Int = 8, + confNumHiddenLayers: Int = 12, + confConvKernelSize: Int = 5, + confPositionalBiasSize: Int = 256, + confReductionFactor: Int = 4, + confResidualWeight: Float = 0.5, + sscpConvChannelSize: [Int] = [128, 32], + sscpConvGroupNormEps: Float = 1e-3, + sscpConvKernelSize: [[Int]] = [[3, 3], [3, 3]], + sscpConvStrideSize: [[Int]] = [[2, 2], [2, 2]], + vocabSize: Int = 128, + sscpConvEps: Float = 1e-3, + rmsNormEps: Float = 1e-6, + gradientClipping: Float = 10000000000.0, + vocabOffset: Int = 262272 // 262_144 + 128 (text vocab size + vision vocab size) + ) { + self.inputFeatSize = inputFeatSize + self.hiddenSize = hiddenSize + self.confAttentionChunkSize = confAttentionChunkSize + self.confAttentionContextLeft = confAttentionContextLeft + self.confAttentionContextRight = confAttentionContextRight + self.confAttentionInvalidLogitsValue = confAttentionInvalidLogitsValue + self.confAttentionLogitCap = confAttentionLogitCap + self.confNumAttentionHeads = confNumAttentionHeads + self.confNumHiddenLayers = confNumHiddenLayers + self.confConvKernelSize = confConvKernelSize + self.confPositionalBiasSize = confPositionalBiasSize + self.confReductionFactor = confReductionFactor + self.confResidualWeight = confResidualWeight + 
self.sscpConvChannelSize = sscpConvChannelSize + self.sscpConvGroupNormEps = sscpConvGroupNormEps + self.sscpConvKernelSize = sscpConvKernelSize + self.sscpConvStrideSize = sscpConvStrideSize + self.vocabSize = vocabSize + self.sscpConvEps = sscpConvEps + self.rmsNormEps = rmsNormEps + self.gradientClipping = gradientClipping + self.vocabOffset = vocabOffset + } + + enum CodingKeys: String, CodingKey { + case inputFeatSize = "input_feat_size" + case hiddenSize = "hidden_size" + case confAttentionChunkSize = "conf_attention_chunk_size" + case confAttentionContextLeft = "conf_attention_context_left" + case confAttentionContextRight = "conf_attention_context_right" + case confAttentionInvalidLogitsValue = "conf_attention_invalid_logits_value" + case confAttentionLogitCap = "conf_attention_logit_cap" + case confNumAttentionHeads = "conf_num_attention_heads" + case confNumHiddenLayers = "conf_num_hidden_layers" + case confConvKernelSize = "conf_conv_kernel_size" + case confPositionalBiasSize = "conf_positional_bias_size" + case confReductionFactor = "conf_reduction_factor" + case confResidualWeight = "conf_residual_weight" + case sscpConvChannelSize = "sscp_conv_channel_size" + case sscpConvGroupNormEps = "sscp_conv_group_norm_eps" + case sscpConvKernelSize = "sscp_conv_kernel_size" + case sscpConvStrideSize = "sscp_conv_stride_size" + case vocabSize = "vocab_size" + case sscpConvEps = "sscp_conv_eps" + case rmsNormEps = "rms_norm_eps" + case gradientClipping = "gradient_clipping" + case vocabOffset = "vocab_offset" + } +} + +public struct VisionConfig: Codable, Sendable, MultimodalConfig { + public let modelType: String + public let numHiddenLayers: Int + public let hiddenSize: Int + public let intermediateSize: Int + public let numAttentionHeads: Int + public let patchSize: Int + public let imageSize: Int + public let numChannels: Int + public let rmsNormEps: Float + public let vocabSize: Int + public let vocabOffset: Int + + public init( + modelType: String = "gemma3n_vision", + numHiddenLayers: Int = 12, + hiddenSize: Int = 2048, + intermediateSize: Int = 8192, + numAttentionHeads: Int = 16, + patchSize: Int = 16, + imageSize: Int = 224, + numChannels: Int = 3, + rmsNormEps: Float = 1e-6, + vocabSize: Int = 128, + vocabOffset: Int = 262144 + ) { + self.modelType = modelType + self.numHiddenLayers = numHiddenLayers + self.hiddenSize = hiddenSize + self.intermediateSize = intermediateSize + self.numAttentionHeads = numAttentionHeads + self.patchSize = patchSize + self.imageSize = imageSize + self.numChannels = numChannels + self.rmsNormEps = rmsNormEps + self.vocabSize = vocabSize + self.vocabOffset = vocabOffset + } + + enum CodingKeys: String, CodingKey { + case modelType = "model_type" + case numHiddenLayers = "num_hidden_layers" + case hiddenSize = "hidden_size" + case intermediateSize = "intermediate_size" + case numAttentionHeads = "num_attention_heads" + case patchSize = "patch_size" + case imageSize = "image_size" + case numChannels = "num_channels" + case rmsNormEps = "rms_norm_eps" + case vocabSize = "vocab_size" + case vocabOffset = "vocab_offset" + } +} + +public struct TextConfig: Codable, Sendable { + public let modelType: String + public let hiddenSize: Int + public let numHiddenLayers: Int + public let intermediateSize: [Int] + public let numAttentionHeads: Int + public let headDim: Int + public let rmsNormEps: Float + public let vocabSize: Int + public let vocabSizePerLayerInput: Int + public let numKeyValueHeads: Int + public let laurelRank: Int + public let 
fracSharedLayers: Float + public let altupActiveIdx: Int + public let padTokenId: Int + public let altupNumInputs: Int + public let altupCoefClip: Float? + public let altupCorrectScale: Bool + public let hiddenSizePerLayerInput: Int + public let ropeLocalBaseFreq: Float + public let ropeTraditional: Bool + public let ropeTheta: Float + public let queryPreAttnScalar: Float + public let slidingWindow: Int + public let ropeScaling: [String: StringOrNumber]? + public let mmTokensPerImage: Int + public let slidingWindowPattern: Int + public let activationSparsityPattern: [Float]? + public let finalLogitSoftcapping: Float + public let queryRescaleScalar: Float + public let numKvSharedLayers: Int + public let maxPositionEmbeddings: Int + public let attnLogitSoftcapping: Float + public let layerTypes: [String] + + enum CodingKeys: String, CodingKey { + case modelType = "model_type" + case hiddenSize = "hidden_size" + case numHiddenLayers = "num_hidden_layers" + case intermediateSize = "intermediate_size" + case numAttentionHeads = "num_attention_heads" + case headDim = "head_dim" + case rmsNormEps = "rms_norm_eps" + case vocabSize = "vocab_size" + case vocabSizePerLayerInput = "vocab_size_per_layer_input" + case numKeyValueHeads = "num_key_value_heads" + case laurelRank = "laurel_rank" + case fracSharedLayers = "frac_shared_layers" + case altupActiveIdx = "altup_active_idx" + case padTokenId = "pad_token_id" + case altupNumInputs = "altup_num_inputs" + case altupCoefClip = "altup_coef_clip" + case altupCorrectScale = "altup_correct_scale" + case hiddenSizePerLayerInput = "hidden_size_per_layer_input" + case ropeLocalBaseFreq = "rope_local_base_freq" + case ropeTraditional = "rope_traditional" + case ropeTheta = "rope_theta" + case queryPreAttnScalar = "query_pre_attn_scalar" + case slidingWindow = "sliding_window" + case ropeScaling = "rope_scaling" + case mmTokensPerImage = "mm_tokens_per_image" + case slidingWindowPattern = "sliding_window_pattern" + case activationSparsityPattern = "activation_sparsity_pattern" + case finalLogitSoftcapping = "final_logit_softcapping" + case queryRescaleScalar = "query_rescale_scalar" + case numKvSharedLayers = "num_kv_shared_layers" + case maxPositionEmbeddings = "max_position_embeddings" + case attnLogitSoftcapping = "attn_logit_softcapping" + case layerTypes = "layer_types" + } +} + +public struct ModelConfig: Codable, Sendable { + public let textConfig: TextConfig + public let visionConfig: VisionConfig + public let audioConfig: AudioConfig + public let modelType: String + public let vocabSize: Int + public let ignoreIndex: Int + public let imageTokenIndex: Int + public let audioTokenId: Int + public let imageTokenId: Int + public let hiddenSize: Int + public let padTokenId: Int + public let visionSoftTokensPerImage: Int + public let audioSoftTokensPerImage: Int + public let eosTokenId: [Int]? 
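+
+    // Illustrative note on the combined vocabulary layout implied by the defaults above
+    // (an assumption for orientation, not part of the checkpoint config): text ids occupy
+    // [0, 262144), vision soft tokens [262144, 262272), and audio soft tokens
+    // [262272, 262400), matching the default vocabOffset values of VisionConfig (262144)
+    // and AudioConfig (262272).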
+
+    enum CodingKeys: String, CodingKey {
+        case textConfig = "text_config"
+        case visionConfig = "vision_config"
+        case audioConfig = "audio_config"
+        case modelType = "model_type"
+        case vocabSize = "vocab_size"
+        case ignoreIndex = "ignore_index"
+        case imageTokenIndex = "image_token_index"
+        case audioTokenId = "audio_token_id"
+        case imageTokenId = "image_token_id"
+        case hiddenSize = "hidden_size"
+        case padTokenId = "pad_token_id"
+        case visionSoftTokensPerImage = "vision_soft_tokens_per_image"
+        case audioSoftTokensPerImage = "audio_soft_tokens_per_image"
+        case eosTokenId = "eos_token_id"
+    }
+}
+
+// MARK: - Language Model Components
+
+private class Gemma3nRMSNorm: Module, UnaryLayer {
+    let eps: Float
+    let scaleShift: Float
+    let withScale: Bool
+    @ModuleInfo var weight: MLXArray?
+
+    init(dim: Int, eps: Float = 1e-6, scaleShift: Float = 0.0, withScale: Bool = true) {
+        self.eps = eps
+        self.scaleShift = scaleShift
+        self.withScale = withScale
+
+        if withScale {
+            self._weight.wrappedValue = MLXArray.ones([dim])
+        } else {
+            self._weight.wrappedValue = nil
+        }
+        super.init()
+    }
+
+    func callAsFunction(_ x: MLXArray) -> MLXArray {
+        let output = norm(x.asType(.float32))
+
+        if withScale, let weight = weight {
+            return (output * (weight + scaleShift)).asType(x.dtype)
+        }
+
+        return output.asType(x.dtype)
+    }
+
+    private func norm(_ x: MLXArray) -> MLXArray {
+        return x * rsqrt(x.square().mean(axis: -1, keepDims: true) + eps)
+    }
+}
+
+private class Gemma3nLaurelBlock: Module {
+    @ModuleInfo var linearLeft: Linear
+    @ModuleInfo var linearRight: Linear
+    @ModuleInfo var postLaurelNorm: Gemma3nRMSNorm
+
+    init(config: TextConfig) {
+        self._linearLeft.wrappedValue = Linear(config.hiddenSize, config.laurelRank, bias: false)
+        self._linearRight.wrappedValue = Linear(config.laurelRank, config.hiddenSize, bias: false)
+        self._postLaurelNorm.wrappedValue = Gemma3nRMSNorm(
+            dim: config.hiddenSize,
+            eps: config.rmsNormEps,
+            scaleShift: 0.0,
+            withScale: true
+        )
+        super.init()
+    }
+
+    func callAsFunction(_ x: MLXArray) -> MLXArray {
+        let laurelX = linearLeft(x)
+        let laurelX2 = linearRight(laurelX)
+        let normedLaurelX = postLaurelNorm(laurelX2)
+        return x + normedLaurelX
+    }
+}
+
+private func rotateHalf(_ x: MLXArray) -> MLXArray {
+    let half = x.shape.last! / 2
+    let x1 = x[.ellipsis, ..<half]
+    let x2 = x[.ellipsis, half...]
+    return concatenated([-x2, x1], axis: -1)
+}
+
+private func applyRotaryPosEmb(
+    _ x: MLXArray, cos: MLXArray, sin: MLXArray, unsqueezeDim: Int = 1
+) -> MLXArray {
+    let cosExpanded = expandedDimensions(cos, axis: unsqueezeDim)
+    let sinExpanded = expandedDimensions(sin, axis: unsqueezeDim)
+    return (x * cosExpanded) + (rotateHalf(x) * sinExpanded)
+}
+
+private class Gemma3nRotaryEmbedding: Module {
+    let ropeType: String
+    let maxSeqLenCached: Int
+    let originalMaxSeqLen: Int
+    let config: TextConfig
+    let attentionScaling: Float
+    @ModuleInfo var invFreq: MLXArray
+    @ModuleInfo var originalInvFreq: MLXArray
+
+    init(config: TextConfig) {
+        if let ropeScaling = config.ropeScaling {
+            let ropeTypeValue = ropeScaling["rope_type"] ?? 
ropeScaling["type"] + if case .string(let typeString) = ropeTypeValue { + self.ropeType = typeString + } else { + self.ropeType = "default" + } + } else { + self.ropeType = "default" + } + + self.maxSeqLenCached = config.maxPositionEmbeddings + self.originalMaxSeqLen = config.maxPositionEmbeddings + self.config = config + self.attentionScaling = 1.0 + + let (invFreq, _) = Self.computeDefaultRopeParameters(config: config) + self._invFreq.wrappedValue = MLXArray(invFreq).asType(.float32) + self._originalInvFreq.wrappedValue = MLXArray(invFreq).asType(.float32) + + super.init() + } + + static func computeDefaultRopeParameters(config: TextConfig) -> ([Float], Float) { + let base = config.ropeTheta + let partialRotaryFactor: Float = 1.0 + let headDim = config.headDim + let dim = Int(Float(headDim) * partialRotaryFactor) + + let attentionFactor: Float = 1.0 + + let invFreqArray: [Float] = stride(from: 0, to: dim, by: 2).map { i in + 1.0 / pow(base, Float(i) / Float(dim)) + } + + return (invFreqArray, attentionFactor) + } + + func callAsFunction(_ x: MLXArray, positionIds: MLXArray) -> (MLXArray, MLXArray) { + let invFreqExpanded = expandedDimensions(invFreq, axes: [0, 2]) + let positionIdsExpanded = expandedDimensions(positionIds.asType(.float32), axes: [1]) + + let freqs = matmul( + invFreqExpanded.asType(.float32), + positionIdsExpanded.asType(.float32) + ).transposed(0, 2, 1) + + let emb = concatenated([freqs, freqs], axis: -1) + let cosEmb = cos(emb) * attentionScaling + let sinEmb = sin(emb) * attentionScaling + + return (cosEmb.asType(x.dtype), sinEmb.asType(x.dtype)) + } +} + +private class Gemma3nAttention: Module { + let isSliding: Bool + let attnLogitSoftcapping: Float + let numHeads: Int + let numKVHeads: Int + let repeats: Int + let headDim: Int + let layerIdx: Int + let scale: Float + let isKvSharedLayer: Bool + let kvSharedLayerIndex: Int? 
+ + @ModuleInfo var qProj: Linear + @ModuleInfo var kProj: Linear + @ModuleInfo var vProj: Linear + @ModuleInfo var oProj: Linear + @ModuleInfo var qNorm: Gemma3nRMSNorm + @ModuleInfo var kNorm: Gemma3nRMSNorm + @ModuleInfo var vNorm: Gemma3nRMSNorm + + init(config: TextConfig, layerIdx: Int) { + self.isSliding = config.layerTypes[layerIdx] == "sliding_attention" + self.attnLogitSoftcapping = config.attnLogitSoftcapping + + let dim = config.hiddenSize + self.numHeads = config.numAttentionHeads + self.numKVHeads = config.numKeyValueHeads + self.repeats = numHeads / numKVHeads + self.headDim = config.headDim + self.layerIdx = layerIdx + self.scale = 1.0 + + self._qProj.wrappedValue = Linear(dim, numHeads * headDim, bias: false) + self._kProj.wrappedValue = Linear(dim, numKVHeads * headDim, bias: false) + self._vProj.wrappedValue = Linear(dim, numKVHeads * headDim, bias: false) + self._oProj.wrappedValue = Linear(numHeads * headDim, dim, bias: false) + + self._qNorm.wrappedValue = Gemma3nRMSNorm(dim: config.headDim, eps: config.rmsNormEps) + self._kNorm.wrappedValue = Gemma3nRMSNorm(dim: config.headDim, eps: config.rmsNormEps) + self._vNorm.wrappedValue = Gemma3nRMSNorm( + dim: config.headDim, + eps: config.rmsNormEps, + withScale: false + ) + + let firstKvSharedLayerIdx = config.numHiddenLayers - config.numKvSharedLayers + self.isKvSharedLayer = layerIdx >= firstKvSharedLayerIdx + + if !isKvSharedLayer { + self.kvSharedLayerIndex = nil + } else if isSliding { + self.kvSharedLayerIndex = firstKvSharedLayerIdx - 2 + } else { + self.kvSharedLayerIndex = firstKvSharedLayerIdx - 1 + } + + super.init() + } + + func callAsFunction( + _ x: MLXArray, + mask: MLXFast.ScaledDotProductAttentionMaskMode? = nil, + cache: KVCache? = nil, + caches: [KVCache?]? = nil, + positionEmbeddings: (MLXArray, MLXArray)? = nil, + cachePosition: MLXArray? 
= nil + ) -> MLXArray { + let inputShape = Array(x.shape.dropLast()) + let hiddenShape = inputShape + [-1, headDim] + + guard let (cos, sin) = positionEmbeddings else { + fatalError("Position embeddings are required") + } + + var queries = qProj(x) + queries = queries.reshaped(hiddenShape) + queries = qNorm(queries) + queries = applyRotaryPosEmb(queries, cos: cos, sin: sin, unsqueezeDim: 2) + queries = queries.transposed(0, 2, 1, 3) + + var keys: MLXArray + var values: MLXArray + + if isKvSharedLayer, + let kvSharedLayerIndex = kvSharedLayerIndex, + let cache = cache, + let caches = caches, + kvSharedLayerIndex < caches.count, + let sharedCache = caches[kvSharedLayerIndex] + { + // Use shared KV from designated cache layer + let sharedState = sharedCache.state + if sharedState.count >= 2 { + keys = sharedState[0] + values = sharedState[1] + } else { + // Fallback: compute KV normally if shared cache is empty + keys = kProj(x).reshaped(hiddenShape) + keys = kNorm(keys) + keys = applyRotaryPosEmb(keys, cos: cos, sin: sin, unsqueezeDim: 2) + keys = keys.transposed(0, 2, 1, 3) + + values = vProj(x).reshaped(hiddenShape) + values = vNorm(values) + values = values.transposed(0, 2, 1, 3) + } + } else { + keys = kProj(x).reshaped(hiddenShape) + keys = kNorm(keys) + keys = applyRotaryPosEmb(keys, cos: cos, sin: sin, unsqueezeDim: 2) + keys = keys.transposed(0, 2, 1, 3) + + values = vProj(x).reshaped(hiddenShape) + values = vNorm(values) + values = values.transposed(0, 2, 1, 3) + } + + // Repeat keys and values for multi-head attention + keys = repeated(keys, count: repeats, axis: 1) + values = repeated(values, count: repeats, axis: 1) + + // Use custom attention function that supports both quantized cache and logit softcapping + let output = gemma3nAttentionWithCacheUpdate( + queries: queries, + keys: keys, + values: values, + cache: cache, + scale: scale, + attnLogitSoftcapping: attnLogitSoftcapping, + mask: mask ?? 
.none + ) + .transposed(0, 2, 1, 3) + .reshaped(inputShape + [-1]) + + return oProj(output) + } +} + +private class MLP: Module, UnaryLayer { + @ModuleInfo var gateProj: Linear + @ModuleInfo var upProj: Linear + @ModuleInfo var downProj: Linear + + let config: TextConfig + let activationSparsity: Float + + init(config: TextConfig, layerIdx: Int = 0) { + self.config = config + let hiddenSize = config.hiddenSize + let intermediateSize = config.intermediateSize[0] + + self._gateProj.wrappedValue = Linear(hiddenSize, intermediateSize, bias: false) + self._upProj.wrappedValue = Linear(hiddenSize, intermediateSize, bias: false) + self._downProj.wrappedValue = Linear(intermediateSize, hiddenSize, bias: false) + + if let activationSparsityPattern = config.activationSparsityPattern { + self.activationSparsity = activationSparsityPattern[layerIdx] + } else { + self.activationSparsity = 0.0 + } + + super.init() + } + + func callAsFunction(_ x: MLXArray) -> MLXArray { + var gateProj = self.gateProj(x) + if activationSparsity > 0.0 { + gateProj = gaussianTopK(gateProj) + } + let activations = geluApproximate(gateProj) + let upProj = self.upProj(x) + let downProj = self.downProj(activations * upProj) + return downProj + } + + private func gaussianTopK(_ inputs: MLXArray) -> MLXArray { + let p = MLXArray(activationSparsity, dtype: .float32) + let stdMultiplier = sqrt(2.0) * erfInverse(2 * p - 1) + let stdMultiplierCasted = stdMultiplier.asType(inputs.dtype) + let inputsMean = mean(inputs, axis: -1, keepDims: true) + let inputsStd = std(inputs, axis: -1, keepDims: true) + let cutoffX = inputsMean + inputsStd * stdMultiplierCasted + return maximum(0, inputs - cutoffX) + } +} + +private class Gemma3nAltUp: Module { + @ModuleInfo var correctOutputScale: MLXArray + @ModuleInfo var correctionCoefs: Linear + @ModuleInfo var predictionCoefs: Linear + @ModuleInfo var modalityRouter: Linear + @ModuleInfo var routerNorm: Gemma3nRMSNorm + @ModuleInfo var routerInputScale: MLXArray + + let config: TextConfig + + init(config: TextConfig) { + self.config = config + + self._correctOutputScale.wrappedValue = MLXArray.zeros([config.hiddenSize]) + self._correctionCoefs.wrappedValue = Linear( + config.altupNumInputs, + config.altupNumInputs, + bias: false + ) + self._predictionCoefs.wrappedValue = Linear( + config.altupNumInputs, + config.altupNumInputs * config.altupNumInputs, + bias: false + ) + self._modalityRouter.wrappedValue = Linear( + config.hiddenSize, + config.altupNumInputs, + bias: false + ) + self._routerNorm.wrappedValue = Gemma3nRMSNorm( + dim: config.hiddenSize, + eps: config.rmsNormEps, + scaleShift: 0.0, + withScale: true + ) + self._routerInputScale.wrappedValue = MLXArray(pow(Float(config.hiddenSize), -1.0)) + + super.init() + } + + func computeRouterModalities(_ x: MLXArray) -> MLXArray { + let routerInputs = + routerNorm(x) * routerInputScale.asType(routerNorm.weight?.dtype ?? 
x.dtype) + let routed = modalityRouter(routerInputs).asType(.float32) + return tanh(routed) + } + + func predict(_ x: MLXArray) -> MLXArray { + let modalities = computeRouterModalities(x[config.altupActiveIdx]) + + var predictionCoefsWeight = predictionCoefs.weight.asType(.float32) + + if let altupCoefClip = config.altupCoefClip { + predictionCoefsWeight = clip( + predictionCoefsWeight, + min: MLXArray(-altupCoefClip), + max: MLXArray(altupCoefClip) + ) + } + + let allCoefs = predictionCoefs(modalities) + .reshaped( + Array(modalities.shape.dropLast()) + [config.altupNumInputs, config.altupNumInputs] + ) + .transposed(0, 1, 3, 2) + + let xPermuted = x.asType(.float32).transposed(1, 2, 3, 0) + let predictions = matmul(xPermuted, allCoefs) + let predictionsPermuted = predictions.transposed(3, 0, 1, 2) + let finalPredictions = predictionsPermuted + x + return finalPredictions.asType(x.dtype) + } + + func correct(predictions: MLXArray, activated: MLXArray) -> MLXArray { + let modalities = computeRouterModalities(activated) + + var correctionCoefsWeight = correctionCoefs.weight.asType(.float32) + + if let altupCoefClip = config.altupCoefClip { + correctionCoefsWeight = clip( + correctionCoefsWeight, + min: MLXArray(-altupCoefClip), + max: MLXArray(altupCoefClip) + ) + } + + let allCoefs = correctionCoefs(modalities) + 1.0 + + let activeX = predictions[config.altupActiveIdx] + let innovation = activated - activeX + + let innovationExpanded = expandedDimensions(innovation, axis: 0) + let innovationBroadcast = broadcast( + innovationExpanded, + to: [config.altupNumInputs] + Array(innovation.shape) + ) + + let allCoefsReshaped = allCoefs.transposed(2, 1, 0) + let allCoefsExpanded = expandedDimensions(allCoefsReshaped, axis: 1) + + let corrected = innovationBroadcast * allCoefsExpanded + let finalCorrected = corrected + predictions + + return finalCorrected.asType(activated.dtype) + } + + func scaleCorrectOutput(_ corrected: MLXArray) -> MLXArray { + let scale = config.altupCorrectScale ? 
correctOutputScale : MLXArray(1.0) + return corrected * scale + } + + func callAsFunction(_ x: MLXArray, activated: MLXArray) -> (MLXArray, MLXArray) { + let predictions = predict(x) + let corrected = correct(predictions: predictions, activated: activated) + var output = corrected[config.altupActiveIdx] + if config.altupCorrectScale { + output = scaleCorrectOutput(output) + } + return (corrected, output) + } +} + +private class Gemma3nDecoderLayer: Module { + let config: TextConfig + let hiddenSize: Int + let layerIdx: Int + let isSliding: Bool + let slidingWindow: Int + let hiddenSizePerLayerInput: Int + + @ModuleInfo var selfAttn: Gemma3nAttention + @ModuleInfo var mlp: MLP + @ModuleInfo var inputLayernorm: Gemma3nRMSNorm + @ModuleInfo var postAttentionLayernorm: Gemma3nRMSNorm + @ModuleInfo var preFeedforwardLayernorm: Gemma3nRMSNorm + @ModuleInfo var postFeedforwardLayernorm: Gemma3nRMSNorm + @ModuleInfo var altup: Gemma3nAltUp + @ModuleInfo var laurel: Gemma3nLaurelBlock + @ModuleInfo var perLayerInputGate: Linear + @ModuleInfo var perLayerProjection: Linear + @ModuleInfo var postPerLayerInputNorm: Gemma3nRMSNorm + + init(config: TextConfig, layerIdx: Int) { + self.config = config + self.hiddenSize = config.hiddenSize + self.layerIdx = layerIdx + self.slidingWindow = config.slidingWindow + self.hiddenSizePerLayerInput = config.hiddenSizePerLayerInput + + self._selfAttn.wrappedValue = Gemma3nAttention(config: config, layerIdx: layerIdx) + self.isSliding = config.layerTypes[layerIdx] == "sliding_attention" + + self._mlp.wrappedValue = MLP(config: config, layerIdx: layerIdx) + self._inputLayernorm.wrappedValue = Gemma3nRMSNorm( + dim: hiddenSize, + eps: config.rmsNormEps, + scaleShift: 0.0, + withScale: true + ) + + self._postAttentionLayernorm.wrappedValue = Gemma3nRMSNorm( + dim: hiddenSize, + eps: config.rmsNormEps, + scaleShift: 0.0, + withScale: true + ) + self._preFeedforwardLayernorm.wrappedValue = Gemma3nRMSNorm( + dim: hiddenSize, + eps: config.rmsNormEps, + scaleShift: 0.0, + withScale: true + ) + self._postFeedforwardLayernorm.wrappedValue = Gemma3nRMSNorm( + dim: hiddenSize, + eps: config.rmsNormEps, + scaleShift: 0.0, + withScale: true + ) + + self._altup.wrappedValue = Gemma3nAltUp(config: config) + self._laurel.wrappedValue = Gemma3nLaurelBlock(config: config) + + self._perLayerInputGate.wrappedValue = Linear( + hiddenSize, + hiddenSizePerLayerInput, + bias: false + ) + self._perLayerProjection.wrappedValue = Linear( + hiddenSizePerLayerInput, + hiddenSize, + bias: false + ) + self._postPerLayerInputNorm.wrappedValue = Gemma3nRMSNorm( + dim: hiddenSize, + eps: config.rmsNormEps, + scaleShift: 0.0, + withScale: true + ) + + super.init() + } + + func callAsFunction( + _ x: MLXArray, + mask: MLXFast.ScaledDotProductAttentionMaskMode? = nil, + cache: KVCache? = nil, + perLayerInput: MLXArray? = nil, + caches: [KVCache?]? = nil, + cachePosition: MLXArray? = nil, + positionEmbeddingsGlobal: (MLXArray, MLXArray)? = nil, + positionEmbeddingsLocal: (MLXArray, MLXArray)? = nil + ) -> MLXArray { + var x = x + if x.ndim == 1 { + x = expandedDimensions(x, axis: 0) + } + + var finalMask = mask + if isSliding, case .array(let maskArray) = mask { + let effectiveSeqLen = max(cachePosition?.shape[0] ?? 
0, slidingWindow)
+            // Most negative finite value, used to knock out masked positions
+            // (mirrors finfo(dtype).min in the reference implementation; the previous
+            // Float.leastNormalMagnitude is a tiny positive value and would not mask)
+            let minDtype = MLXArray(-Float.greatestFiniteMagnitude)
+
+            let slidingWindowMask = tril(
+                MLXArray.ones([maskArray.shape[0], effectiveSeqLen], dtype: .bool),
+                k: -slidingWindow
+            )
+            let updatedMask = MLX.where(slidingWindowMask, minDtype, maskArray)
+
+            let offset = max(0, (cachePosition?.max().item() ?? 0) - effectiveSeqLen + 1)
+            let maskIndexes = MLXArray(0 ..< min(effectiveSeqLen, updatedMask.shape.last!)) + offset
+            let slicedMask = take(updatedMask, maskIndexes.asType(.int32), axis: -1)
+            finalMask = .array(slicedMask)
+        }
+
+        let predictions = altup.predict(x)
+        let activePrediction = predictions[config.altupActiveIdx]
+
+        let activePredictionNormed = inputLayernorm(activePrediction)
+        let laurelOutput = laurel(activePredictionNormed)
+
+        let positionEmbeddings = isSliding ? positionEmbeddingsLocal : positionEmbeddingsGlobal
+
+        let attn = selfAttn(
+            activePredictionNormed,
+            mask: finalMask,
+            cache: cache,
+            caches: caches,
+            positionEmbeddings: positionEmbeddings,
+            cachePosition: cachePosition
+        )
+
+        let attnNormed = postAttentionLayernorm(attn)
+        let attnGated = activePrediction + attnNormed
+        let attnLaurel =
+            (attnGated + laurelOutput) / sqrt(MLXArray(2.0, dtype: activePrediction.dtype))
+
+        let attnNormFf = preFeedforwardLayernorm(attnLaurel)
+        let attnFfw = mlp(attnNormFf)
+        let attnFfwNorm = postFeedforwardLayernorm(attnFfw)
+        let attnFfwLaurelGated = attnLaurel + attnFfwNorm
+
+        var correctedPredictions = altup.correct(
+            predictions: predictions, activated: attnFfwLaurelGated)
+
+        var firstPrediction = correctedPredictions[config.altupActiveIdx]
+        if config.altupCorrectScale {
+            firstPrediction = altup.scaleCorrectOutput(firstPrediction)
+        }
+
+        firstPrediction = perLayerInputGate(firstPrediction)
+        firstPrediction = geluApproximate(firstPrediction)
+
+        // Per-layer input multiplication is always performed in the Python version
+        guard let perLayerInput = perLayerInput else {
+            fatalError(
+                "per_layer_input is required but was nil. This should never happen in normal operation."
+ ) + } + firstPrediction = firstPrediction * perLayerInput + + firstPrediction = perLayerProjection(firstPrediction) + firstPrediction = postPerLayerInputNorm(firstPrediction) + + for i in 1 ..< correctedPredictions.shape[0] { + correctedPredictions[i] = correctedPredictions[i] + firstPrediction + } + + return correctedPredictions + } +} + +private class Gemma3nTextScaledWordEmbedding: Module, UnaryLayer { + @ModuleInfo var weight: MLXArray + let embedScale: Float + + init(numEmbeddings: Int, embeddingDim: Int, embedScale: Float = 1.0) { + self.embedScale = embedScale + self._weight.wrappedValue = MLXRandom.normal([numEmbeddings, embeddingDim]) + super.init() + } + + func callAsFunction(_ x: MLXArray) -> MLXArray { + let indices = x.asType(.int32) + let embeddings = take(weight, indices, axis: 0) + return embeddings * MLXArray(embedScale, dtype: .float32).asType(weight.dtype) + } +} + +private class Gemma3Model: Module { + let config: TextConfig + let hiddenSize: Int + let vocabSize: Int + let vocabSizePerLayerInput: Int + let numHiddenLayers: Int + let perLayerProjectionScale: MLXArray + let perLayerInputScale: MLXArray + + @ModuleInfo var embedTokens: Gemma3nTextScaledWordEmbedding + @ModuleInfo var layers: [Gemma3nDecoderLayer] + @ModuleInfo var embedTokensPerLayer: Gemma3nTextScaledWordEmbedding + @ModuleInfo var perLayerModelProjection: Linear + @ModuleInfo var perLayerProjectionNorm: Gemma3nRMSNorm + @ModuleInfo var altupProjections: [Linear] + @ModuleInfo var altupUnembedProjections: [Linear] + @ModuleInfo var norm: Gemma3nRMSNorm + @ModuleInfo var ropeEmbedding: Gemma3nRotaryEmbedding + @ModuleInfo var ropeEmbeddingLocal: Gemma3nRotaryEmbedding + + init(config: TextConfig) { + self.config = config + self.hiddenSize = config.hiddenSize + self.vocabSize = config.vocabSize + self.vocabSizePerLayerInput = config.vocabSizePerLayerInput + self.numHiddenLayers = config.numHiddenLayers + + assert(vocabSize > 0) + + self._embedTokens.wrappedValue = Gemma3nTextScaledWordEmbedding( + numEmbeddings: config.vocabSize, + embeddingDim: config.hiddenSize, + embedScale: pow(Float(config.hiddenSize), 0.5) + ) + + self._layers.wrappedValue = (0 ..< config.numHiddenLayers).map { layerIdx in + Gemma3nDecoderLayer(config: config, layerIdx: layerIdx) + } + + self._embedTokensPerLayer.wrappedValue = Gemma3nTextScaledWordEmbedding( + numEmbeddings: config.vocabSizePerLayerInput, + embeddingDim: config.numHiddenLayers * config.hiddenSizePerLayerInput, + embedScale: pow(Float(config.hiddenSizePerLayerInput), 0.5) + ) + + self._perLayerModelProjection.wrappedValue = Linear( + config.hiddenSize, + config.numHiddenLayers * config.hiddenSizePerLayerInput, + bias: false + ) + + self._perLayerProjectionNorm.wrappedValue = Gemma3nRMSNorm( + dim: config.hiddenSizePerLayerInput, + eps: config.rmsNormEps, + scaleShift: 0.0, + withScale: true + ) + + self._altupProjections.wrappedValue = (1 ..< config.altupNumInputs).map { _ in + Linear(config.hiddenSize, config.hiddenSize, bias: false) + } + + self._altupUnembedProjections.wrappedValue = (1 ..< config.altupNumInputs).map { _ in + Linear(config.hiddenSize, config.hiddenSize, bias: false) + } + + self._norm.wrappedValue = Gemma3nRMSNorm( + dim: config.hiddenSize, + eps: config.rmsNormEps, + scaleShift: 0.0, + withScale: true + ) + + self.perLayerProjectionScale = MLXArray(pow(Float(hiddenSize), -0.5)) + self.perLayerInputScale = rsqrt(MLXArray(2.0)) + + self._ropeEmbedding.wrappedValue = Gemma3nRotaryEmbedding(config: config) + + var localConfig = config + // Note: 
Creating a modified copy for local rope is a simplification;
+        // a full implementation would rebuild this embedding with config.ropeLocalBaseFreq.
+        self._ropeEmbeddingLocal.wrappedValue = Gemma3nRotaryEmbedding(config: localConfig)
+
+        super.init()
+    }
+
+    func callAsFunction(
+        inputs: MLXArray? = nil,
+        inputsEmbeds: MLXArray? = nil,
+        mask: MLXFast.ScaledDotProductAttentionMaskMode? = nil,
+        cache: [KVCache?]? = nil,
+        perLayerInputs: MLXArray? = nil
+    ) -> MLXArray {
+        var h: MLXArray
+        if let inputsEmbeds = inputsEmbeds {
+            h = inputsEmbeds
+        } else if let inputs = inputs {
+            h = embedTokens(inputs)
+        } else {
+            fatalError("Either inputs or inputsEmbeds must be provided")
+        }
+
+        let perLayerInputsProcessed: MLXArray
+        if let perLayerInputs = perLayerInputs {
+            perLayerInputsProcessed = perLayerInputs
+        } else if let inputs = inputs {
+            perLayerInputsProcessed = getPerLayerInputs(inputs)
+        } else {
+            fatalError("Cannot generate per layer inputs without input ids")
+        }
+
+        let finalPerLayerInputs = projectPerLayerInputs(h, perLayerInputs: perLayerInputsProcessed)
+
+        let cacheArray = cache ?? Array(repeating: nil as KVCache?, count: layers.count)
+
+        let pastSeenTokens = cacheArray.first??.offset ?? 0
+        let cachePosition = MLXArray(pastSeenTokens ..< (pastSeenTokens + h.shape[1]))
+
+        var fullMask: MLXFast.ScaledDotProductAttentionMaskMode = .none
+        var slidingWindowMask: MLXFast.ScaledDotProductAttentionMaskMode = .none
+
+        if mask == nil {
+            let j = config.slidingWindowPattern
+            if j > 0 && j <= cacheArray.count {
+                let globalCacheSlice = Array(cacheArray[(j - 1) ..< j]).compactMap { $0 }
+                fullMask = createAttentionMask(h: h, cache: globalCacheSlice, returnArray: true)
+            }
+            slidingWindowMask = createAttentionMask(
+                h: h, cache: cacheArray.compactMap { $0 }, returnArray: true)
+        }
+
+        let h0 = h
+
+        let positionIds = expandedDimensions(cachePosition, axis: 0)
+        let positionEmbeddingsGlobal = ropeEmbedding(h0, positionIds: positionIds)
+        let positionEmbeddingsLocal = ropeEmbeddingLocal(h0, positionIds: positionIds)
+
+        let targetMagnitude = pow(mean(h0.square(), axis: -1, keepDims: true), 0.5)
+        let epsilonTensor = MLXArray(Float.leastNormalMagnitude, dtype: h0.dtype)
+
+        var hList = Array(repeating: h0, count: config.altupNumInputs)
+
+        for i in 1 ..< config.altupNumInputs {
+            let altupProj = altupProjections[i - 1](hList[i])
+            hList[i] = altupProj.asType(h0.dtype)
+            let newMagnitude = pow(mean(hList[i].square(), axis: -1, keepDims: true), 0.5)
+            hList[i] = hList[i] * (targetMagnitude / maximum(newMagnitude, epsilonTensor))
+        }
+
+        h = stacked(hList, axis: 0)
+
+        for (i, (layer, c)) in zip(layers[..<layers.count], cacheArray).enumerated() {
+            let perLayerInput = finalPerLayerInputs[0..., 0..., i, 0...]
+            let isGlobal = config.layerTypes[i] == "full_attention"
+            let layerMask = isGlobal ? fullMask : slidingWindowMask
+
+            h = layer(
+                h,
+                mask: mask ?? layerMask,
+                cache: c,
+                perLayerInput: perLayerInput,
+                caches: cacheArray,
+                cachePosition: cachePosition,
+                positionEmbeddingsGlobal: positionEmbeddingsGlobal,
+                positionEmbeddingsLocal: positionEmbeddingsLocal
+            )
+        }
+
+        // Collapse the altup streams back into a single hidden state
+        let targetMagnitudeFinal = pow(mean(h[0].square(), axis: -1, keepDims: true), 0.5)
+        for i in 1 ..< config.altupNumInputs {
+            let altupUnembProj = altupUnembedProjections[i - 1](h[i])
+            h[i] = altupUnembProj.asType(h0.dtype)
+            let newMagnitude = pow(mean(h[i].square(), axis: -1, keepDims: true), 0.5)
+            h[i] = h[i] * (targetMagnitudeFinal / maximum(newMagnitude, epsilonTensor))
+        }
+
+        return norm(mean(h, axis: 0))
+    }
+
+    func getPerLayerInputs(_ inputIds: MLXArray) -> MLXArray {
+        let perLayerInputsMask = logicalAnd(
+            inputIds .>= 0,
+            inputIds .< vocabSizePerLayerInput
+        )
+        let tokens = MLX.where(perLayerInputsMask, inputIds, MLXArray.zeros(like: inputIds))
+        let result = embedTokensPerLayer(tokens).reshaped(
+            Array(inputIds.shape) + [config.numHiddenLayers, config.hiddenSizePerLayerInput]
+        )
+        return result
+    }
+
+    func projectPerLayerInputs(_ inputsEmbeds: MLXArray, perLayerInputs: MLXArray?) -> MLXArray {
+        var perLayerProjection = perLayerModelProjection(inputsEmbeds)
+        perLayerProjection = perLayerProjection * perLayerProjectionScale.asType(inputsEmbeds.dtype)
+
+        perLayerProjection = perLayerProjection.reshaped(
+            Array(inputsEmbeds.shape.dropLast()) + [
+                config.numHiddenLayers, config.hiddenSizePerLayerInput,
+            ]
+        )
+        perLayerProjection = perLayerProjectionNorm(perLayerProjection)
+
+        guard let perLayerInputs = perLayerInputs else {
+            return perLayerProjection
+        }
+
+        var adjustedPerLayerInputs = perLayerInputs
+        if perLayerProjection.shape != perLayerInputs.shape {
+            let targetLayers = min(
+                config.numHiddenLayers, perLayerInputs.shape[perLayerInputs.shape.count - 2])
+            adjustedPerLayerInputs = perLayerInputs[.ellipsis, ..<targetLayers, 0...]
+        }
+
+        return (perLayerProjection + adjustedPerLayerInputs)
+            * perLayerInputScale.asType(inputsEmbeds.dtype)
+    }
+}
+
+private class LanguageModel: Module {
+    let config: TextConfig
+    let finalLogitSoftcapping: Float?
+    var kvHeads: [Int]
+
+    @ModuleInfo var model: Gemma3Model
+    @ModuleInfo var lmHead: Linear
+
+    init(config: TextConfig) {
+        self.config = config
+        self.finalLogitSoftcapping = config.finalLogitSoftcapping
+        self.kvHeads = (0 ..< config.numHiddenLayers).map { _ in config.numKeyValueHeads }
+        self._model.wrappedValue = Gemma3Model(config: config)
+        self._lmHead.wrappedValue = Linear(config.hiddenSize, config.vocabSize, bias: false)
+        super.init()
+    }
+
+    func newCache(parameters: GenerateParameters?) -> [any KVCache] {
+        var caches: [any KVCache] = []
+        let slidingWindow = config.slidingWindow > 0 ? config.slidingWindow : 4096
+        let slidingWindowPattern = config.slidingWindowPattern
+
+        for i in 0 ..< config.numHiddenLayers {
+            let isGlobalLayer = (i % slidingWindowPattern == slidingWindowPattern - 1)
+            if isGlobalLayer {
+                caches.append(StandardKVCache())
+            } else {
+                caches.append(RotatingKVCache(maxSize: slidingWindow, keep: 0))
+            }
+        }
+        return caches
+    }
+
+    func callAsFunction(
+        inputs: MLXArray? = nil,
+        inputsEmbeds: MLXArray? = nil,
+        mask: MLXFast.ScaledDotProductAttentionMaskMode? = nil,
+        cache: [KVCache?]? = nil,
+        perLayerInputs: MLXArray? = nil
+    ) -> LMOutput {
+        let out = model(
+            inputs: inputs,
+            inputsEmbeds: inputsEmbeds,
+            mask: mask,
+            cache: cache,
+            perLayerInputs: perLayerInputs
+        )
+        var finalLogits = lmHead(out)
+
+        if let softcap = finalLogitSoftcapping, softcap > 0 {
+            finalLogits = tanh(finalLogits / softcap) * softcap
+        }
+
+        return LMOutput(logits: finalLogits)
+    }
+
+    func sanitize(weights: [String: MLXArray]) -> [String: MLXArray] {
+        var sanitizedWeights = [String: MLXArray]()
+
+        for (k, v) in weights {
+            if k.contains("self_attn.rotary_emb.inv_freq") {
+                continue
+            } else if !k.contains("language_model.model") && !k.contains("language_model.lm_head") {
+                let newKey = k.replacingOccurrences(
+                    of: "language_model", with: "language_model.model")
+                sanitizedWeights[newKey] = v
+            } else {
+                sanitizedWeights[k] = v
+            }
+        }
+
+        if sanitizedWeights["language_model.lm_head.weight"] == nil {
+            let embedTokensKey = "language_model.model.embed_tokens.weight"
+            if let embedWeight = sanitizedWeights[embedTokensKey] {
+                sanitizedWeights["language_model.lm_head.weight"] = embedWeight
+            }
+        }
+
+        return sanitizedWeights
+    }
+}
+
+// MARK: - Multimodal Embedder
+
+private class Gemma3nMultimodalEmbedder: Module, UnaryLayer {
+    let multimodalHiddenSize: Int
+    let eps: Float
+    let vocabOffset: Int
+    let vocabSize: Int
+    let textHiddenSize: Int
+
+    @ModuleInfo var embedding: Embedding
+    @ModuleInfo var hardEmbeddingNorm: Gemma3nRMSNorm
+    @ModuleInfo var softEmbeddingNorm: Gemma3nRMSNorm
+    @ModuleInfo var embeddingProjection: Linear
+    @ModuleInfo var embeddingPostProjectionNorm: Gemma3nRMSNorm
+
+    init(multimodalConfig: any MultimodalConfig, textConfig: TextConfig) {
+        self.multimodalHiddenSize = multimodalConfig.hiddenSize
+        self.eps = multimodalConfig.rmsNormEps
+        self.vocabOffset = multimodalConfig.vocabOffset
+        self.vocabSize = multimodalConfig.vocabSize
+        self.textHiddenSize = textConfig.hiddenSize
+
+        self._embedding.wrappedValue = Embedding(
+            embeddingCount: vocabSize,
+            dimensions: multimodalHiddenSize
+        )
+        self._hardEmbeddingNorm.wrappedValue = Gemma3nRMSNorm(
+            dim: multimodalHiddenSize,
+            eps: eps
+        )
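+        // "Hard" inputs (discrete ids offset into this modality's table) go through
+        // hardEmbeddingNorm above; "soft" inputs (precomputed embeddings) go through
+        // softEmbeddingNorm below. See callAsFunction for the dispatch between the two.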
self._softEmbeddingNorm.wrappedValue = Gemma3nRMSNorm( + dim: multimodalHiddenSize, + eps: eps + ) + self._embeddingProjection.wrappedValue = Linear( + multimodalHiddenSize, + textHiddenSize, + bias: false + ) + self._embeddingPostProjectionNorm.wrappedValue = Gemma3nRMSNorm( + dim: textHiddenSize, + eps: eps, + withScale: false + ) + + super.init() + } + + func callAsFunction(_ inputIds: MLXArray?, inputsEmbeds: MLXArray?) -> MLXArray { + guard (inputIds == nil) != (inputsEmbeds == nil) else { + fatalError("You must specify exactly one of inputIds or inputsEmbeds") + } + + let embNorm: MLXArray + if let inputsEmbeds = inputsEmbeds { + embNorm = softEmbeddingNorm(inputsEmbeds) + } else if let inputIds = inputIds { + let hardEmb = embedding(inputIds - vocabOffset) + embNorm = hardEmbeddingNorm(hardEmb) + } else { + fatalError("Either inputIds or inputsEmbeds must be provided") + } + + let embNormProj = embeddingProjection(embNorm) + let projected = embeddingPostProjectionNorm(embNormProj) + return projected + } + + func callAsFunction(_ inputIds: MLXArray) -> MLXArray { + return callAsFunction(inputIds, inputsEmbeds: nil) + } +} + +// MARK: - Helper Functions + +// MARK: - Custom Attention for Gemma3n with Logit Softcapping + +/// Custom attention function for Gemma3n that supports: +/// - Logit softcapping (applied before softmax) +/// - Standard KV cache support +/// - Exact alignment with Python implementation +/// +/// TODO: Quantized KV Cache Integration +/// Action items for adding quantized cache support: +/// 1. Add QuantizedKVCache detection: `if let quantizedKVCache = cache as? QuantizedKVCache` +/// 2. Use quantizedKVCache.updateQuantized(keys: keys, values: values) for cache update +/// 3. Implement manual quantized attention computation with logit softcapping: +/// - Cannot use quantizedScaledDotProductAttention directly (no softcapping support) +/// - Need to manually compute: matmul(queries, dequantized_keys) with softcapping +/// - May require dequantization of keys for logit softcapping application +/// 4. Consider performance trade-offs: +/// - Manual dequantization vs quantized attention benefits +/// - Might need hybrid approach or dedicated quantized+softcapping function +/// 5. Test with QuantizedKVCache to ensure numerical accuracy matches Python +/// 6. 
Update documentation and examples
+private func gemma3nAttentionWithCacheUpdate(
+    queries: MLXArray,
+    keys: MLXArray,
+    values: MLXArray,
+    cache: KVCache?,
+    scale: Float,
+    attnLogitSoftcapping: Float,
+    mask: MLXFast.ScaledDotProductAttentionMaskMode = .none
+) -> MLXArray {
+    // Update cache and get cached keys/values (matches Python's cache.update_and_fetch)
+    let (cachedKeys, cachedValues): (MLXArray, MLXArray)
+
+    if let cache = cache {
+        (cachedKeys, cachedValues) = cache.update(keys: keys, values: values)
+    } else {
+        (cachedKeys, cachedValues) = (keys, values)
+    }
+
+    // Manual attention computation to support logit softcapping.
+    // This matches the Python implementation:
+    // attn_weights = mx.matmul(queries, keys.swapaxes(2, 3)) * self.scale
+    var attnWeights = matmul(queries, cachedKeys.swappedAxes(2, 3)) * scale
+
+    // Apply logit softcapping if enabled (matches Python):
+    // if self.attn_logit_softcapping is not None and self.attn_logit_softcapping > 0:
+    if attnLogitSoftcapping > 0 {
+        attnWeights = attnWeights / attnLogitSoftcapping
+        attnWeights = tanh(attnWeights)
+        attnWeights = attnWeights * attnLogitSoftcapping
+    }
+
+    // Apply mask if provided (matches Python):
+    // if mask is not None: causal_mask = mask[:, : keys.shape[-2]]
+    if case .array(let maskArray) = mask {
+        let causalMask = maskArray[0..., ..<cachedKeys.shape[2]]
+        attnWeights = attnWeights + causalMask
+    }
+
+    // Softmax in float32 for numerical stability, then combine with the values
+    attnWeights = softmax(attnWeights.asType(.float32), axis: -1).asType(queries.dtype)
+    return matmul(attnWeights, cachedValues)
+}
+
+// MARK: - Bicubic Interpolation
+
+private func bicubicInterpolate(
+    _ x: MLXArray, targetSize: (Int, Int), alignCorners: Bool = false
+) -> MLXArray {
+    // TODO: This implementation uses nested loops and sequential MLX operations, which is much
+    // slower than the Python version, which uses mx.fast.metal_kernel() for parallel GPU
+    // computation. MLX Swift currently doesn't have custom Metal kernel creation capabilities
+    // like Python's mx.fast.metal_kernel(). Consider optimizing with vectorized MLX operations
+    // or requesting custom kernel support from the MLX Swift team for better performance.
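+
+    // Worked example of the a = -0.5 cubic kernel implemented below (illustrative, not from
+    // the original source): a sample landing exactly halfway between two pixels (frac = 0.5)
+    // reads four taps at distances 1.5, 0.5, 0.5, 1.5, giving weights
+    // [-0.0625, 0.5625, 0.5625, -0.0625], which sum to 1.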
+ + // Input: NHWC format [batch, height, width, channels] + // Output: NHWC format [batch, target_height, target_width, channels] + + let inputShape = x.shape + let (batchSize, inputHeight, inputWidth, channels) = ( + inputShape[0], inputShape[1], inputShape[2], inputShape[3] + ) + let (targetHeight, targetWidth) = targetSize + + // If no resizing needed, return input + if inputHeight == targetHeight && inputWidth == targetWidth { + return x + } + + // Convert to float32 for computation if needed + let inputDtype = x.dtype + let xFloat = x.asType(.float32) + + // Calculate scale factors + let scaleH: Float + let scaleW: Float + + if alignCorners && targetHeight > 1 && targetWidth > 1 { + scaleH = Float(inputHeight - 1) / Float(targetHeight - 1) + scaleW = Float(inputWidth - 1) / Float(targetWidth - 1) + } else { + scaleH = Float(inputHeight) / Float(targetHeight) + scaleW = Float(inputWidth) / Float(targetWidth) + } + + // Bicubic kernel function (matches Python implementation with a=-0.5) + func cubicKernel(_ x: Float) -> Float { + let absx = abs(x) + let absx2 = absx * absx + let absx3 = absx2 * absx + let a: Float = -0.5 + + if absx <= 1.0 { + return (a + 2.0) * absx3 - (a + 3.0) * absx2 + 1.0 + } else if absx < 2.0 { + return a * absx3 - 5.0 * a * absx2 + 8.0 * a * absx - 4.0 * a + } + return 0.0 + } + + // Create output array + var result = MLXArray.zeros( + [batchSize, targetHeight, targetWidth, channels], type: Float32.self) + + // Process each output pixel + for outY in 0 ..< targetHeight { + for outX in 0 ..< targetWidth { + // Calculate input coordinates + let inY: Float + let inX: Float + + if alignCorners && targetHeight > 1 && targetWidth > 1 { + inY = Float(outY) * scaleH + inX = Float(outX) * scaleW + } else { + inY = (Float(outY) + 0.5) * scaleH - 0.5 + inX = (Float(outX) + 0.5) * scaleW - 0.5 + } + + // Get integer and fractional parts + let y0 = Int(floor(inY)) + let x0 = Int(floor(inX)) + let yFrac = inY - Float(y0) + let xFrac = inX - Float(x0) + + // Bicubic interpolation with 4x4 neighborhood + var interpolatedPixel = MLXArray.zeros([batchSize, channels], type: Float32.self) + var weightSum: Float = 0.0 + + for i in -1 ... 2 { + let yPos = max(0, min(y0 + i, inputHeight - 1)) + let wy = cubicKernel(yFrac - Float(i)) + + for j in -1 ... 2 { + let xPos = max(0, min(x0 + j, inputWidth - 1)) + let wx = cubicKernel(xFrac - Float(j)) + let weight = wy * wx + + if weight != 0.0 { + let pixelValue = xFloat[0..., yPos, xPos, 0...] + interpolatedPixel = interpolatedPixel + pixelValue * weight + weightSum += weight + } + } + } + + // Normalize by weight sum + if weightSum > 0.0 { + interpolatedPixel = interpolatedPixel / weightSum + } + + // Set the result + result[0..., outY, outX, 0...] 
= interpolatedPixel + } + } + + // Convert back to original dtype + return result.asType(inputDtype) +} + +private func maskedScatter( + inputTensor: MLXArray, + mask: MLXArray, + source: MLXArray +) -> MLXArray { + let maskBool = mask.asType(.bool) + + if !maskBool.any().item() { + return broadcast(inputTensor, to: mask.shape) + } + + let inputShape = mask.shape + var resultFlat = broadcast(inputTensor, to: inputShape).flattened() + let maskFlat = maskBool.flattened() + let sourceFlat = source.flattened() + + let selectionMask = cumsum(maskFlat.asType(.int32)) - 1 + let sourceLen = sourceFlat.shape[0] + let boundedIndices = selectionMask % sourceLen + + let selectedValues = take(sourceFlat, boundedIndices, axis: 0) + resultFlat = MLX.where(maskFlat, selectedValues, resultFlat) + + return resultFlat.reshaped(inputShape) +} + +private func checkArrayShape(_ arr: MLXArray) -> Bool { + let shape = arr.shape + guard shape.count == 4 else { return false } + + let (outChannels, kH, kW, _) = (shape[0], shape[1], shape[2], shape[3]) + return (outChannels >= kH) && (outChannels >= kW) && (kH == kW) +} + +// MARK: - Main Model + +public class Gemma3n: Module, VLMModel, KVCacheDimensionProvider { + @ModuleInfo private var languageModel: LanguageModel + @ModuleInfo private var visionTower: Gemma3nVisionModel + @ModuleInfo private var audioTower: Gemma3nAudioModel + @ModuleInfo private var embedVision: Gemma3nMultimodalEmbedder + @ModuleInfo private var embedAudio: Gemma3nMultimodalEmbedder + + public let config: ModelConfig + + public var vocabularySize: Int { config.vocabSize } + public var kvHeads: [Int] { languageModel.kvHeads } + + public func newCache(parameters: GenerateParameters?) -> [any KVCache] { + return languageModel.newCache(parameters: parameters) + } + + public init(_ config: ModelConfig) { + self.config = config + + self._languageModel.wrappedValue = LanguageModel(config: config.textConfig) + self._visionTower.wrappedValue = Gemma3nVisionModel(config: config.visionConfig) + self._audioTower.wrappedValue = Gemma3nAudioModel(config: config.audioConfig) + + self._embedVision.wrappedValue = Gemma3nMultimodalEmbedder( + multimodalConfig: config.visionConfig, + textConfig: config.textConfig + ) + self._embedAudio.wrappedValue = Gemma3nMultimodalEmbedder( + multimodalConfig: config.audioConfig, + textConfig: config.textConfig + ) + + super.init() + } + + func getInputEmbeddings( + inputIds: MLXArray? = nil, + pixelValues: MLXArray? = nil, + inputFeatures: MLXArray? = nil, + inputFeaturesMask: MLXArray? = nil + ) -> MLXArray { + if pixelValues == nil && inputFeatures == nil { + return languageModel.model.embedTokens(inputIds!) 
+ } + + guard let inputIds = inputIds else { + fatalError("Input IDs required for multimodal input") + } + + var inputsEmbeds = languageModel.model.embedTokens(inputIds) + + // Ensure no gaps between text, vision, and audio embeddings, in that order + // This matches the Python assertion + assert( + embedAudio.vocabOffset == config.vocabSize - config.audioConfig.vocabSize, + "Audio vocab offset mismatch" + ) + assert( + embedVision.vocabOffset == config.vocabSize - config.audioConfig.vocabSize + - config.visionConfig.vocabSize, + "Vision vocab offset mismatch" + ) + + // Handle vision tokens + if pixelValues != nil { + let visionMask = logicalAnd( + inputIds .>= config.visionConfig.vocabOffset, + inputIds .< config.audioConfig.vocabOffset + ) + + if visionMask.any().item() { + let visionTokens = MLX.where(visionMask, inputIds, MLXArray.zeros(like: inputIds)) + let visionEmbedsFlat = embedVision(visionTokens) + inputsEmbeds = MLX.where( + expandedDimensions(visionMask, axis: -1), + visionEmbedsFlat, + inputsEmbeds + ) + } + } + + // Handle audio tokens + if inputFeatures != nil { + let audioMask = inputIds .>= config.audioConfig.vocabOffset + + if audioMask.any().item() { + let audioTokens = MLX.where(audioMask, inputIds, MLXArray.zeros(like: inputIds)) + let audioEmbedsFlat = embedAudio(audioTokens) + inputsEmbeds = MLX.where( + expandedDimensions(audioMask, axis: -1), + audioEmbedsFlat, + inputsEmbeds + ) + } + } + + // Process vision features + if let pixelValues = pixelValues { + let pixelValuesTyped = pixelValues.asType(languageModel.model.embedTokens.weight.dtype) + let imageFeatures = getImageFeatures(pixelValuesTyped) + + return mergeMultimodalAndText( + inputIds: inputIds, + inputsEmbeds: inputsEmbeds, + features: imageFeatures, + tokenId: config.imageTokenId, + modality: "image" + ) + } + + // Process audio features + if let inputFeatures = inputFeatures, let inputFeaturesMask = inputFeaturesMask { + let (audioFeatures, audioMask) = getAudioFeatures(inputFeatures, .!inputFeaturesMask) + let audioPaddingIds = MLXArray([config.vocabSize - 1]).expandedDimensions(axis: 0) + let audioPaddingEmbs = embedAudio(audioPaddingIds) + + let maskedAudioFeatures = MLX.where( + expandedDimensions(audioMask, axis: -1), + audioPaddingEmbs, + audioFeatures + ) + + let audioBatchSize = maskedAudioFeatures.shape[0] + let audioSeqLen = maskedAudioFeatures.shape[1] + let audioEmbedDim = maskedAudioFeatures.shape[2] + let extraPaddingTokens = config.audioSoftTokensPerImage - audioSeqLen + + let extraPaddingFeatures = broadcast( + audioPaddingEmbs, + to: [audioBatchSize, extraPaddingTokens, audioEmbedDim] + ) + + let finalAudioFeatures = concatenated( + [maskedAudioFeatures, extraPaddingFeatures], axis: 1) + + return mergeMultimodalAndText( + inputIds: inputIds, + inputsEmbeds: inputsEmbeds, + features: finalAudioFeatures, + tokenId: config.audioTokenId, + modality: "audio" + ) + } + + return inputsEmbeds + } + + func getAudioFeatures(_ inputFeatures: MLXArray, _ inputFeaturesMask: MLXArray) -> ( + MLXArray, MLXArray + ) { + let (audioOutputs, audioMask) = audioTower(inputFeatures, inputFeaturesMask) + return (embedAudio(nil, inputsEmbeds: audioOutputs), audioMask) + } + + func getImageFeatures(_ pixelValues: MLXArray) -> MLXArray { + let visionOutputs = visionTower(pixelValues, outputHiddenStates: true) + + // Python: vision_outputs.transpose(0, 3, 1, 2) - NHWC -> NCHW + let visionOutputsNCHW = visionOutputs.transposed(0, 3, 1, 2) + + // Python: reshape and transpose to get [batch, tokens, features] + 
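+        // Shape walk-through with the default vision config (assuming the vision tower yields
+        // a 16x16 feature grid; hiddenSize 2048, visionSoftTokensPerImage 256 = 16 * 16):
+        // NHWC [B, 16, 16, 2048] -> NCHW [B, 2048, 16, 16] -> [B, 2048, 256] -> [B, 256, 2048].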
let reshaped = visionOutputsNCHW.reshaped([ + visionOutputsNCHW.shape[0], + config.visionConfig.hiddenSize, + config.visionSoftTokensPerImage, + ]).transposed(0, 2, 1) + + // Normalize and embed the soft tokens into language model space + let scaledOutputs = reshaped * pow(Float(config.visionConfig.hiddenSize), 0.5) + return embedVision(nil, inputsEmbeds: scaledOutputs) + } + + func mergeMultimodalAndText( + inputIds: MLXArray?, + inputsEmbeds: MLXArray, + features: MLXArray, + tokenId: Int, + modality: String + ) -> MLXArray { + let specialModalityMask: MLXArray + + if let inputIds = inputIds { + specialModalityMask = expandedDimensions(inputIds .== tokenId, axis: -1) + } else { + // When inputIds is nil, create mask by comparing embeddings + let embedFn: (MLXArray) -> MLXArray = + modality == "audio" + ? { self.embedAudio($0, inputsEmbeds: nil) } + : { self.languageModel.model.embedTokens($0) } + let tokenEmbedding = embedFn(MLXArray([tokenId])) + specialModalityMask = inputsEmbeds .== tokenEmbedding + } + + let specialModalityMaskBroadcast = broadcast(specialModalityMask, to: inputsEmbeds.shape) + + let modalityTokensInText = specialModalityMaskBroadcast.sum().item(Int.self) + let featureTokens = features.size + + guard modalityTokensInText == featureTokens else { + fatalError( + """ + Number of \(modality)s does not match number of special \(modality) tokens in the input text. + Got \(modalityTokensInText) \(modality) tokens in the text and \(featureTokens) tokens from \(modality) embeddings. + """) + } + + let featuresTyped = features.asType(inputsEmbeds.dtype) + return maskedScatter( + inputTensor: inputsEmbeds, mask: specialModalityMaskBroadcast, source: featuresTyped) + } + + public func prepare(_ input: LMInput, cache: [any KVCache], windowSize: Int?) throws + -> PrepareResult + { + let inputIds = input.text.tokens + let pixelValues = input.image?.pixels + + let inputsEmbeds = getInputEmbeddings( + inputIds: inputIds, + pixelValues: pixelValues + ) + + let perLayerInputs = languageModel.model.getPerLayerInputs(inputIds) + let convertedCache = cache.compactMap { $0 as? KVCache } + + let result = languageModel( + inputs: nil, + inputsEmbeds: inputsEmbeds, + mask: .causal, + cache: convertedCache, + perLayerInputs: perLayerInputs + ) + + return .logits(result) + } + + public func callAsFunction(_ inputs: MLXArray, cache: [any KVCache]?) -> MLXArray { + let convertedCache = cache?.compactMap { $0 as? KVCache } + return languageModel(inputs: inputs, cache: convertedCache).logits + } + + public func sanitize(weights: [String: MLXArray]) -> [String: MLXArray] { + var processedWeights = languageModel.sanitize(weights: weights) + processedWeights = visionTower.sanitize(weights: processedWeights) + processedWeights = audioTower.sanitize(weights: processedWeights) + + var sanitizedWeights = [String: MLXArray]() + for (k, v) in processedWeights { + if k.hasPrefix("model.") { + sanitizedWeights[String(k.dropFirst(6))] = v + } else { + sanitizedWeights[k] = v + } + } + + return sanitizedWeights + } + + public static func fromPretrained(pathOrHfRepo: String) throws -> Gemma3n { + let path = URL(fileURLWithPath: pathOrHfRepo) + + // Load config + let configPath = path.appendingPathComponent("config.json") + let configData = try Data(contentsOf: configPath) + let configDict = try JSONSerialization.jsonObject(with: configData) as! 
[String: Any] + + // Create nested configs + let textConfig = try JSONDecoder().decode( + TextConfig.self, + from: JSONSerialization.data(withJSONObject: configDict["text_config"]!)) + let visionConfig = try JSONDecoder().decode( + VisionConfig.self, + from: JSONSerialization.data(withJSONObject: configDict["vision_config"]!)) + let audioConfig = try JSONDecoder().decode( + AudioConfig.self, + from: JSONSerialization.data(withJSONObject: configDict["audio_config"]!)) + + let modelConfig = ModelConfig( + textConfig: textConfig, + visionConfig: visionConfig, + audioConfig: audioConfig, + modelType: configDict["model_type"] as? String ?? "gemma3n", + vocabSize: configDict["vocab_size"] as? Int ?? 257152, + ignoreIndex: configDict["ignore_index"] as? Int ?? -100, + imageTokenIndex: configDict["image_token_index"] as? Int ?? 262145, + audioTokenId: configDict["audio_token_id"] as? Int ?? 262273, + imageTokenId: configDict["image_token_id"] as? Int ?? 262145, + hiddenSize: configDict["hidden_size"] as? Int ?? 2048, + padTokenId: configDict["pad_token_id"] as? Int ?? 0, + visionSoftTokensPerImage: configDict["vision_soft_tokens_per_image"] as? Int ?? 256, + audioSoftTokensPerImage: configDict["audio_soft_tokens_per_image"] as? Int ?? 188, + eosTokenId: configDict["eos_token_id"] as? [Int] + ) + + let model = Gemma3n(modelConfig) + + // Load weights + let weightFiles = try FileManager.default.contentsOfDirectory(atPath: path.path) + .filter { $0.hasSuffix(".safetensors") } + + guard !weightFiles.isEmpty else { + throw NSError( + domain: "ModelLoading", code: 1, + userInfo: [NSLocalizedDescriptionKey: "No safetensors found"]) + } + + var weights = [String: MLXArray]() + for weightFile in weightFiles { + let weightPath = path.appendingPathComponent(weightFile) + let fileWeights = try loadArrays(url: weightPath) + weights.merge(fileWeights) { _, new in new } + } + + var sanitizedWeights = model.sanitize(weights: weights) + sanitizedWeights = model.visionTower.sanitize(weights: sanitizedWeights) + try model.update(parameters: ModuleParameters.unflattened(sanitizedWeights), verify: [.all]) + + return model + } +} + +// MARK: - Audio Model Components + +// MARK: - Helper Functions for Padding +private func convertTorchToMLXPadWidth(_ padding: [Int], _ inputShape: [Int]) -> [IntOrPair] { + let ndim = inputShape.count + var padWidth = Array(repeating: IntOrPair((0, 0)), count: ndim) + + if ndim >= 1 && padding.count >= 2 { + padWidth[ndim - 1] = IntOrPair((padding[0], padding[1])) + } + if ndim >= 2 && padding.count >= 4 { + padWidth[ndim - 2] = IntOrPair((padding[2], padding[3])) + } + if ndim >= 3 && padding.count >= 6 { + padWidth[ndim - 3] = IntOrPair((padding[4], padding[5])) + } + if ndim >= 4 && padding.count >= 8 { + padWidth[ndim - 4] = IntOrPair((padding[6], padding[7])) + } + + return padWidth +} + +// MARK: - Audio Relative Position Embedding +private class Gemma3nAudioRelativePositionEmbedding: Module { + let config: AudioConfig + let numHeads: Int + let channels: Int + let headDim: Int + let maxBackward: Int + let maxForward: Int + + @ModuleInfo var posProj: Linear + @ModuleInfo var invTimescales: MLXArray + + init(config: AudioConfig) { + self.config = config + self.numHeads = config.confNumAttentionHeads + self.channels = config.hiddenSize + self.headDim = channels / numHeads + self.maxBackward = + config.confAttentionContextLeft > 0 ? 
config.confAttentionContextLeft - 1 : 0 + self.maxForward = config.confAttentionContextRight + + self._posProj.wrappedValue = Linear(channels, numHeads * headDim, bias: false) + + let minTimescale: Float = 1.0 + let maxTimescale: Float = 1.0e4 + let numTimescales = channels / 2 + let logTimescaleIncrement = + log(maxTimescale / minTimescale) / max(Float(numTimescales - 1), 1) + let invTimescales = + minTimescale + * exp( + MLXArray(0 ..< numTimescales).asType(.float32) * (-logTimescaleIncrement) + ) + + self._invTimescales.wrappedValue = expandedDimensions( + expandedDimensions(invTimescales, axis: 0), + axis: 0 + ) + + super.init() + } + + private func getTimingSignal1dPos(_ position: MLXArray, dtype: DType) -> MLXArray { + assert(position.ndim == 2) + let positionFloat = expandedDimensions(position.asType(.float32), axis: -1) + + let scaledTime = positionFloat * invTimescales + let timingSignal = concatenated([sin(scaledTime), cos(scaledTime)], axis: -1) + return timingSignal.asType(dtype) + } + + private func relativeShift( + _ termBdBeforeShift: MLXArray, + batchSize: Int, + numHeads: Int, + numQueryBlocks: Int, + queryBlockSize: Int, + keyContextSize: Int, + maxSpanPlus1: Int + ) -> MLXArray { + let padAmountLastDim = (keyContextSize + 1) - maxSpanPlus1 + let paddingTuple = [0, padAmountLastDim] + + let termBdPadded = padded( + termBdBeforeShift, + widths: convertTorchToMLXPadWidth(paddingTuple, Array(termBdBeforeShift.shape)) + ) + + let termBdReshaped = termBdPadded.reshaped([ + batchSize, + numHeads, + numQueryBlocks, + queryBlockSize * (keyContextSize + 1), + ]) + + let termBdSliced = termBdReshaped[0..., 0..., 0..., ..<(queryBlockSize * keyContextSize)] + + let termBdShifted = termBdSliced.reshaped([ + batchSize, + numHeads, + numQueryBlocks, + queryBlockSize, + keyContextSize, + ]) + + return termBdShifted + } + + func callAsFunction(_ queries: MLXArray, _ keys: MLXArray) -> MLXArray { + let (batchSize, numQueryBlocks, queryBlockSize, numHeads, headDim) = ( + queries.shape[0], queries.shape[1], queries.shape[2], queries.shape[3], queries.shape[4] + ) + let keyContextSize = keys.shape[2] + + // Relative positions for sinusoidal embeddings + let posIndices = expandedDimensions( + MLXArray(stride(from: maxBackward, through: -maxForward - 1, by: -1)), + axis: 0 + ) + let maxSpanPlus1 = posIndices.shape[1] + + let sinEmbTimingSignal = getTimingSignal1dPos(posIndices, dtype: queries.dtype) + + // Project sinusoidal embeddings + let projectedSinEmb = posProj(sinEmbTimingSignal) + let sinEmb = projectedSinEmb.reshaped([1, maxSpanPlus1, numHeads, headDim]).squeezed( + axis: 0) + + // Term AC: Query-Key content interaction + let queriesP = queries.transposed(0, 3, 1, 2, 4) + let keysPT = keys.transposed(0, 3, 1, 4, 2) + let termAc = matmul(queriesP, keysPT) + + // Term BD: Query-Position interaction + let qTransposed = queries.transposed(0, 3, 1, 2, 4) + let sTransposed = sinEmb.transposed(1, 2, 0) + + let qReshaped = qTransposed.reshaped([ + batchSize, numHeads, numQueryBlocks * queryBlockSize, headDim, + ]) + + let termBdUnshifedMatmul = matmul(qReshaped, sTransposed) + + let termBdUnshifed = termBdUnshifedMatmul.reshaped([ + batchSize, + numHeads, + numQueryBlocks, + queryBlockSize, + maxSpanPlus1, + ]) + + let termBdShifted = relativeShift( + termBdUnshifed, + batchSize: batchSize, + numHeads: numHeads, + numQueryBlocks: numQueryBlocks, + queryBlockSize: queryBlockSize, + keyContextSize: keyContextSize, + maxSpanPlus1: maxSpanPlus1 + ) + + return termAc + termBdShifted + } +} + +// 
MARK: - Cumulative Group Norm +private class Gemma3nCumulativeGroupNorm: Module { + let numChannels: Int + let featureDims: [Int] + let eps: Float + let useScale: Bool + let useBias: Bool + let reductionAxes: [Int] + + @ModuleInfo var weight: MLXArray? + @ModuleInfo var bias: MLXArray? + + init( + numChannels: Int, + featureDims: [Int], + eps: Float = 1e-3, + useScale: Bool = true, + useBias: Bool = false + ) { + self.numChannels = numChannels + self.featureDims = featureDims + self.eps = eps + self.useScale = useScale + self.useBias = useBias + + // Axes for normalization: all dimensions except Batch (0) and Time (1) + self.reductionAxes = Array(2 ..< (2 + featureDims.count + 1)) + + if useScale { + self._weight.wrappedValue = MLXArray.ones([numChannels]) + } else { + self._weight.wrappedValue = nil + } + + if useBias { + self._bias.wrappedValue = MLXArray.zeros([numChannels]) + } else { + self._bias.wrappedValue = nil + } + + super.init() + } + + func callAsFunction(_ x: MLXArray, mask: MLXArray? = nil) -> MLXArray { + let expectedInputSuffix = featureDims + [numChannels] + assert(Array(x.shape.suffix(expectedInputSuffix.count)) == expectedInputSuffix) + + if let mask = mask { + assert(mask.shape == Array(x.shape.prefix(2))) + assert(mask.dtype == .bool) + } + + let inputDtype = x.dtype + let calcDtype = DType.float32 + let xCalc = x.asType(calcDtype) + + let maskCalc: MLXArray + if let mask = mask { + let maskSuffixShape = Array(repeating: 1, count: expectedInputSuffix.count) + maskCalc = mask.reshaped(Array(mask.shape) + maskSuffixShape).asType(calcDtype) + } else { + maskCalc = MLXArray.ones(like: xCalc).asType(calcDtype) + } + + let xMaskedForSum = xCalc * maskCalc + + // Cumulative Statistics Calculation + let sumValuesAtT = sum(xMaskedForSum, axes: reductionAxes, keepDims: true) + let cumSumValues = cumsum(sumValuesAtT, axis: 1) + + let elementsInGroupAtT = sum(maskCalc, axes: reductionAxes, keepDims: true) + let cumCountElements = cumsum(elementsInGroupAtT, axis: 1) + let safeCumCountElements = clip(cumCountElements, min: MLXArray(1)) + + let cumMean = cumSumValues / safeCumCountElements + + let squaredDiffFromMean = pow(xCalc - cumMean, 2) + let sumSqDiffAtT = sum( + squaredDiffFromMean * maskCalc, + axes: reductionAxes, + keepDims: true + ) + let cumSumSqDiff = cumsum(sumSqDiffAtT, axis: 1) + + let cumVariance = cumSumSqDiff / safeCumCountElements + + var normalizedX = (xCalc - cumMean) * rsqrt(cumVariance + eps) + + if useScale, let weight = weight { + let scale = weight.asType(calcDtype) + let scaleViewShape = Array(repeating: 1, count: x.ndim - 1) + [numChannels] + normalizedX = normalizedX * scale.reshaped(scaleViewShape) + } + + if useBias, let bias = bias { + let biasValue = bias.asType(calcDtype) + let biasViewShape = Array(repeating: 1, count: x.ndim - 1) + [numChannels] + normalizedX = normalizedX + biasValue.reshaped(biasViewShape) + } + + let finalOutput = normalizedX * maskCalc + return finalOutput.asType(inputDtype) + } +} + +// MARK: - Audio SSCP Conv Block +private class Gemma3nAudioSSCPConvBlock: Module { + let config: AudioConfig + let manualPadding: [Int] + + @ModuleInfo var conv: Conv2d + @ModuleInfo var norm: Gemma3nCumulativeGroupNorm + + init( + idx: Int, + inputFreqDim: Int, + config: AudioConfig, + manualPadding: [Int] = [0, 0, 0, 0] + ) { + self.config = config + self.manualPadding = manualPadding + + let inChannels = idx == 0 ? 
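+        // Illustrative use of the cumulative norm above (shapes assumed): with
+        // numChannels = 32 and featureDims = [40], an input [B, T, 40, 32] keeps its
+        // shape, and the mean/variance at step t cover only frames 0...t, so the
+        // normalization stays causal over time:
+        //   let norm = Gemma3nCumulativeGroupNorm(numChannels: 32, featureDims: [40])
+        //   norm(MLXArray.zeros([1, 10, 40, 32])).shape  // [1, 10, 40, 32]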
1 : config.sscpConvChannelSize[idx - 1] + let outChannels = config.sscpConvChannelSize[idx] + let (kernelH, kernelW) = ( + config.sscpConvKernelSize[idx][0], config.sscpConvKernelSize[idx][1] + ) + let (strideH, strideW) = ( + config.sscpConvStrideSize[idx][0], config.sscpConvStrideSize[idx][1] + ) + + self._conv.wrappedValue = Conv2d( + inputChannels: inChannels, + outputChannels: outChannels, + kernelSize: IntOrPair((kernelH, kernelW)), + stride: IntOrPair((strideH, strideW)), + padding: IntOrPair((0, 0)), + bias: false + ) + + let fInPadded = inputFreqDim + manualPadding[0] + manualPadding[1] + let fOutConv = (fInPadded - kernelW) / strideW + 1 + + self._norm.wrappedValue = Gemma3nCumulativeGroupNorm( + numChannels: outChannels, + featureDims: [fOutConv], + eps: config.sscpConvEps, + useScale: true, + useBias: false + ) + + super.init() + } + + func callAsFunction(_ x: MLXArray) -> MLXArray { + let audioencodingsPadded = padded( + x, + widths: convertTorchToMLXPadWidth(manualPadding, Array(x.shape)) + ) + + let audioencodingsConv = conv(audioencodingsPadded.transposed(0, 2, 3, 1)) + let xNormed = norm(audioencodingsConv) + let audioencodingsNormed = xNormed.transposed(0, 3, 1, 2) + return relu(audioencodingsNormed) + } +} + +// MARK: - Audio Subsample Conv Projection +private class Gemma3nAudioSubSampleConvProjection: Module { + let config: AudioConfig + let inputProjInFeatures: Int + + @ModuleInfo var conv0: Gemma3nAudioSSCPConvBlock + @ModuleInfo var conv1: Gemma3nAudioSSCPConvBlock + @ModuleInfo var inputProjLinear: Linear + + init(config: AudioConfig) { + self.config = config + + var currentFForBlockInput = config.inputFeatSize + var calculatedBlockPadding: [[Int]] = [] + var calculatedFOutDims: [Int] = [] + + for i in 0 ..< 2 { + let (kernelH, kernelW) = ( + config.sscpConvKernelSize[i][0], config.sscpConvKernelSize[i][1] + ) + let (strideH, strideW) = ( + config.sscpConvStrideSize[i][0], config.sscpConvStrideSize[i][1] + ) + + let padTTop = 0 + let padTBottom = kernelH - 1 + let padFLeft = 1 + let padFRight = 1 + + let manualPaddingTuple = [padFLeft, padFRight, padTTop, padTBottom] + calculatedBlockPadding.append(manualPaddingTuple) + + let fInPadded = currentFForBlockInput + padFLeft + padFRight + let fOutAfterConv = (fInPadded - kernelW) / strideW + 1 + calculatedFOutDims.append(fOutAfterConv) + currentFForBlockInput = fOutAfterConv + } + + self._conv0.wrappedValue = Gemma3nAudioSSCPConvBlock( + idx: 0, + inputFreqDim: config.inputFeatSize, + config: config, + manualPadding: calculatedBlockPadding[0] + ) + + self._conv1.wrappedValue = Gemma3nAudioSSCPConvBlock( + idx: 1, + inputFreqDim: calculatedFOutDims[0], + config: config, + manualPadding: calculatedBlockPadding[1] + ) + + let finalCOut = config.sscpConvChannelSize.last! + let finalFOut = calculatedFOutDims.last! 
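+        // Shape walk-through with the defaults (inputFeatSize = 80, 3x3 kernels,
+        // stride 2, frequency padding 1 + 1): 80 -> (82 - 3) / 2 + 1 = 40
+        // -> (42 - 3) / 2 + 1 = 20, so the projection input below is
+        // finalCOut * finalFOut = 32 * 20 = 640 features.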
+ self.inputProjInFeatures = finalCOut * finalFOut + + self._inputProjLinear.wrappedValue = Linear( + inputProjInFeatures, + config.hiddenSize, + bias: false + ) + + super.init() + } + + func callAsFunction(_ x: MLXArray) -> MLXArray { + // audio_encodings is [B, T, F_in] + // Reshape to [B, 1, T, F_in] + let audioencodingsReshaped = expandedDimensions(x, axis: 1) + var result = conv0(audioencodingsReshaped) + result = conv1(result) + + let (b, cOut, tOut, fOut) = ( + result.shape[0], result.shape[1], result.shape[2], result.shape[3] + ) + let xTransposed = result.transposed(0, 2, 3, 1) + let outputFlattened = xTransposed.reshaped([b, tOut, fOut * cOut]) + let output = inputProjLinear(outputFlattened) + return output + } +} + +// MARK: - Audio Attention +private class Gemma3nAudioAttention: Module { + let config: AudioConfig + let numHeads: Int + let hiddenSize: Int + let headDim: Int + let chunkSize: Int + let maxFutureHorizon: Int + let maxPastHorizon: Int + let attentionInvalidLogitsValue: Float + let attentionLogitsSoftCap: Float + let contextSize: Int + let qScale: Float + let localCausalValidMask: MLXArray + let softcap: MLXArray + + @ModuleInfo var relativePositionEmbedding: Gemma3nAudioRelativePositionEmbedding + @ModuleInfo var perDimScale: MLXArray + @ModuleInfo var qProj: Linear + @ModuleInfo var kProj: Linear + @ModuleInfo var vProj: Linear + + init(config: AudioConfig) { + self.config = config + self.numHeads = config.confNumAttentionHeads + self.hiddenSize = config.hiddenSize + self.headDim = hiddenSize / numHeads + self.chunkSize = config.confAttentionChunkSize + self.maxFutureHorizon = config.confAttentionContextRight + self.maxPastHorizon = max(0, config.confAttentionContextLeft - 1) + self.attentionInvalidLogitsValue = config.confAttentionInvalidLogitsValue + self.attentionLogitsSoftCap = config.confAttentionLogitCap + self.contextSize = chunkSize + maxPastHorizon + maxFutureHorizon + + self._relativePositionEmbedding.wrappedValue = Gemma3nAudioRelativePositionEmbedding( + config: config) + self._perDimScale.wrappedValue = MLXArray.zeros([headDim]) + + self._qProj.wrappedValue = Linear(hiddenSize, numHeads * headDim, bias: false) + self._kProj.wrappedValue = Linear(hiddenSize, numHeads * headDim, bias: false) + self._vProj.wrappedValue = Linear(hiddenSize, numHeads * headDim, bias: false) + + let qScale = pow(Float(headDim), -0.5) + let rSoftplus0 = 1.0 / log(2.0) + self.qScale = qScale * Float(rSoftplus0) + + let lowerCausalMask = tril( + MLXArray.ones([contextSize, chunkSize], dtype: .bool), + k: 0 + ).transposed() + + let upperCausalMask = tril( + MLXArray.ones([chunkSize, contextSize], dtype: .bool), + k: maxPastHorizon + maxFutureHorizon + ) + + let localCausalValidMaskTemp = MLXArray.ones([chunkSize, contextSize], dtype: .bool) + self.localCausalValidMask = localCausalValidMaskTemp .&& lowerCausalMask .&& upperCausalMask + + self.softcap = MLXArray(attentionLogitsSoftCap, dtype: .float32) + + super.init() + } + + private func padDim1(_ x: MLXArray, dim10Val: Int, dim11Val: Int) -> MLXArray { + var paddingTuple = Array(repeating: 0, count: x.ndim * 2) + let dimIdxFromEnd = x.ndim - 2 + let startIdxForDim = 2 * dimIdxFromEnd + paddingTuple[startIdxForDim] = dim10Val + paddingTuple[startIdxForDim + 1] = dim11Val + + return padded( + x, + widths: convertTorchToMLXPadWidth(paddingTuple, Array(x.shape)) + ) + } + + private func convertToBlock(_ x: MLXArray, paddingVal: Float = 0.0) -> MLXArray { + let shape = x.shape + let (b, t) = (shape[0], shape[1]) + let numBlocks = 
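+        // e.g. t = 30, chunkSize = 12 -> numBlocks = ceil(30 / 12) = 3; the sequence
+        // is padded to 36 frames and reshaped to [B, 3, 12, ...], one row per chunk.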
(t + chunkSize - 1) / chunkSize + + let paddingLen = numBlocks * chunkSize - t + let paddedX = paddingLen > 0 ? padDim1(x, dim10Val: 0, dim11Val: paddingLen) : x + + let permutedims = [b, numBlocks, chunkSize] + Array(shape.dropFirst(2)) + return paddedX.reshaped(permutedims) + } + + private func unfoldMLX(_ x: MLXArray, dimension: Int, size: Int, step: Int) -> MLXArray { + let shape = x.shape + let dimSize = shape[dimension] + let numWindows = (dimSize - size) / step + 1 + + var windows: [MLXArray] = [] + for i in 0 ..< numWindows { + let startIdx = i * step + let endIdx = startIdx + size + + var slices: [any MLXArrayIndex] = Array(repeating: .ellipsis, count: shape.count) + slices[dimension] = startIdx ..< endIdx + + windows.append(x[slices]) + } + + return stacked(windows, axis: dimension + 1) + } + + private func extractBlockContext(_ x: MLXArray) -> MLXArray { + let padLeft = maxPastHorizon + let padRight = maxFutureHorizon + chunkSize - 1 + let paddedX = padDim1(x, dim10Val: padLeft, dim11Val: padRight) + + let frameLen = contextSize + let frameStep = chunkSize + + let xUnfolded = unfoldMLX(paddedX, dimension: 1, size: frameLen, step: frameStep) + + if x.ndim > 2 && xUnfolded.ndim > 3 { + return xUnfolded.transposed(0, 2, 1, 3, 4) + } + + return xUnfolded + } + + func callAsFunction(_ x: MLXArray, mask: MLXArray) -> MLXArray { + let queryStates = qProj(x).reshaped( + Array(x.shape.dropLast()) + [numHeads, headDim] + ) + let keyStates = kProj(x).reshaped( + Array(x.shape.dropLast()) + [numHeads, headDim] + ) + let valueStates = vProj(x).reshaped( + Array(x.shape.dropLast()) + [numHeads, headDim] + ) + + let perDimScaleSp = logAddExp(perDimScale, MLXArray(0.0)) + let broadcastShape = [1, 1, 1, headDim] + let perDimScaleSpBroadcast = perDimScaleSp.reshaped(broadcastShape) + let scaledQueryStates = queryStates * qScale * perDimScaleSpBroadcast + + let (batchSize, qTime) = (scaledQueryStates.shape[0], scaledQueryStates.shape[1]) + + let queryBlocks = convertToBlock(scaledQueryStates) + let keyBlocks = extractBlockContext(keyStates) + let valueBlocks = extractBlockContext(valueStates) + let numQueryBlocks = queryBlocks.shape[1] + + // Create validity mask + let originalValidMask = .!mask + let extractedValidMaskBlocks = extractBlockContext(originalValidMask).transposed(0, 2, 1) + + let conditionFromInputValidity = expandedDimensions( + expandedDimensions(extractedValidMaskBlocks, axis: 1), + axis: -2 + ) + + let conditionFromCausality = expandedDimensions( + expandedDimensions( + expandedDimensions(localCausalValidMask, axis: 0), + axis: 0 + ), + axis: 0 + ) + + let finalConditionForWhere = conditionFromInputValidity .&& conditionFromCausality + + var logits = relativePositionEmbedding(queryBlocks, keyBlocks) + + // Apply attention logit softcap + logits = logits / softcap + logits = tanh(logits) + logits = logits * softcap + + // Apply the combined mask + logits = MLX.where( + finalConditionForWhere, + logits, + MLXArray(attentionInvalidLogitsValue) + ) + + let probabilities = softmax(logits.asType(.float32), axis: -1).asType(valueBlocks.dtype) + + // Compute context vectors + let (bDim, nDim, uDim, wDim, cDim) = ( + probabilities.shape[0], probabilities.shape[1], probabilities.shape[2], + probabilities.shape[3], probabilities.shape[4] + ) + let hDim = valueBlocks.shape.last! 
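+        // Context geometry with the defaults: chunkSize = 12, maxPastHorizon =
+        // max(0, 13 - 1) = 12, maxFutureHorizon = 0, so each query chunk attends to
+        // contextSize = 12 + 12 + 0 = 24 key/value frames per extracted block.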
+
+        let probBun = probabilities.transposed(0, 2, 1, 3, 4).reshaped([-1, wDim, cDim])
+        let vBun = valueBlocks.transposed(0, 1, 3, 2, 4).reshaped([-1, cDim, hDim])
+        let resultBmm = matmul(probBun, vBun)
+
+        var contextVectors = resultBmm.reshaped([bDim, uDim, nDim, wDim, hDim]).transposed(
+            0, 1, 3, 2, 4)
+        contextVectors = contextVectors.reshaped([
+            batchSize,
+            numQueryBlocks * chunkSize,
+            numHeads,
+            headDim,
+        ])
+
+        // Trim the block padding back to the original query length
+        contextVectors = contextVectors[0..., ..<qTime, 0..., 0...]
+        return contextVectors
+    }
+}
+
+// MARK: - Conformer Attention
+private class Gemma3nAudioConformerAttention: Module {
+    let config: AudioConfig
+    let gradientClipping: MLXArray
+
+    @ModuleInfo var preAttnNorm: Gemma3nRMSNorm
+    @ModuleInfo var attn: Gemma3nAudioAttention
+    @ModuleInfo var post: Linear
+    @ModuleInfo var postNorm: Gemma3nRMSNorm
+
+    init(config: AudioConfig) {
+        self.config = config
+        self.gradientClipping = MLXArray(config.gradientClipping)
+
+        self._preAttnNorm.wrappedValue = Gemma3nRMSNorm(dim: config.hiddenSize)
+        self._attn.wrappedValue = Gemma3nAudioAttention(config: config)
+        self._post.wrappedValue = Linear(config.hiddenSize, config.hiddenSize, bias: false)
+        self._postNorm.wrappedValue = Gemma3nRMSNorm(dim: config.hiddenSize)
+
+        super.init()
+    }
+
+    func callAsFunction(_ x: MLXArray, mask: MLXArray) -> MLXArray {
+        let audioencodingsInputToAttn = x
+        let clippedX = clip(x, min: -gradientClipping, max: gradientClipping)
+        let audioencodingsNorm = preAttnNorm(clippedX)
+        let audioencodingsAttnOut = attn(audioencodingsNorm, mask: mask)
+
+        let (b, t, numHeads, headDim) = (
+            audioencodingsAttnOut.shape[0], audioencodingsAttnOut.shape[1],
+            audioencodingsAttnOut.shape[2], audioencodingsAttnOut.shape[3]
+        )
+        let audioencodingsReshaped = audioencodingsAttnOut.reshaped([b, t, numHeads * headDim])
+
+        let postResult = post(audioencodingsReshaped)
+        let clippedPost = clip(postResult, min: -gradientClipping, max: gradientClipping)
+        return audioencodingsInputToAttn + postNorm(clippedPost)
+    }
+}
+
+// MARK: - Conformer Feed Forward
+private class Gemma3nAudioConformerFeedForward: Module {
+    let config: AudioConfig
+    let gradientClipping: MLXArray
+    let postLayerScale: MLXArray
+
+    @ModuleInfo var preLayerNorm: Gemma3nRMSNorm
+    @ModuleInfo var ffwLayer1: Linear
+    @ModuleInfo var ffwLayer2: Linear
+    @ModuleInfo var postLayerNorm: Gemma3nRMSNorm
+
+    init(config: AudioConfig) {
+        self.config = config
+        self.gradientClipping = MLXArray(config.gradientClipping)
+        self.postLayerScale = MLXArray(config.confResidualWeight)
+
+        self._preLayerNorm.wrappedValue = Gemma3nRMSNorm(dim: config.hiddenSize)
+        self._ffwLayer1.wrappedValue = Linear(config.hiddenSize, config.hiddenSize * 4, bias: false)
+        self._ffwLayer2.wrappedValue = Linear(config.hiddenSize * 4, config.hiddenSize, bias: false)
+        self._postLayerNorm.wrappedValue = Gemma3nRMSNorm(dim: config.hiddenSize)
+
+        super.init()
+    }
+
+    func callAsFunction(_ x: MLXArray) -> MLXArray {
+        let residual = x
+        let clippedX = clip(x, min: -gradientClipping, max: gradientClipping)
+        var result = preLayerNorm(clippedX)
+        result = ffwLayer1(result)
+        result = silu(result)
+        result = ffwLayer2(result)
+        let clippedResult = clip(result, min: -gradientClipping, max: gradientClipping)
+        let normedResult = postLayerNorm(clippedResult)
+        return residual + (normedResult * postLayerScale)
+    }
+}
+
+// MARK: - Conformer Light Conv1D
+private class Gemma3nAudioConformerLightConv1d: Module {
+    let config: AudioConfig
+    let gradientClipping: MLXArray
+    let causalPadding: Int
+
+    @ModuleInfo var preLayerNorm: Gemma3nRMSNorm
+    @ModuleInfo var linearStart: Linear
+    @ModuleInfo var depthwiseConv1d: Conv1d
+    @ModuleInfo var convNorm: Gemma3nRMSNorm
+    @ModuleInfo var linearEnd: Linear
+
+    init(config: AudioConfig) {
+        self.config = config
+        self.gradientClipping = MLXArray(config.gradientClipping)
+        self.causalPadding = config.confConvKernelSize - 1
+
+        self._preLayerNorm.wrappedValue = Gemma3nRMSNorm(
+            dim: config.hiddenSize,
+            eps: config.rmsNormEps
+        )
+        self._linearStart.wrappedValue = Linear(
+            config.hiddenSize,
+            config.hiddenSize * 2,
+            bias: false
+        )
+        self._depthwiseConv1d.wrappedValue = Conv1d(
+            inputChannels: config.hiddenSize,
+            outputChannels: config.hiddenSize,
+            kernelSize: config.confConvKernelSize,
+            stride: 1,
+            padding: 0,
+            groups: config.hiddenSize,
+            bias: false
+        )
+        self._convNorm.wrappedValue = 
Gemma3nRMSNorm( + dim: config.hiddenSize, + eps: config.rmsNormEps + ) + self._linearEnd.wrappedValue = Linear(config.hiddenSize, config.hiddenSize, bias: false) + + super.init() + } + + func callAsFunction(_ audioencodings: MLXArray) -> MLXArray { + let audioencodingsResidual = audioencodings + + var result = preLayerNorm(audioencodings) + result = linearStart(result) + result = glu(result, axis: -1) + + // Apply manual causal padding and conv1d + let audioencodingsTransposed = result.transposed(0, 2, 1) + let paddedAudio = padded( + audioencodingsTransposed, + widths: convertTorchToMLXPadWidth( + [causalPadding, 0], Array(audioencodingsTransposed.shape)) + ) + + result = depthwiseConv1d(paddedAudio.transposed(0, 2, 1)) + result = clip(result, min: -gradientClipping, max: gradientClipping) + result = convNorm(result) + result = silu(result) + result = linearEnd(result) + + return result + audioencodingsResidual + } +} + +// MARK: - Conformer Block +private class Gemma3nAudioConformerBlock: Module { + let config: AudioConfig + let gradientClipping: MLXArray + + @ModuleInfo var ffwLayerStart: Gemma3nAudioConformerFeedForward + @ModuleInfo var attention: Gemma3nAudioConformerAttention + @ModuleInfo var lconv1d: Gemma3nAudioConformerLightConv1d + @ModuleInfo var ffwLayerEnd: Gemma3nAudioConformerFeedForward + @ModuleInfo var norm: Gemma3nRMSNorm + + init(config: AudioConfig) { + self.config = config + self.gradientClipping = MLXArray(config.gradientClipping) + + self._ffwLayerStart.wrappedValue = Gemma3nAudioConformerFeedForward(config: config) + self._attention.wrappedValue = Gemma3nAudioConformerAttention(config: config) + self._lconv1d.wrappedValue = Gemma3nAudioConformerLightConv1d(config: config) + self._ffwLayerEnd.wrappedValue = Gemma3nAudioConformerFeedForward(config: config) + self._norm.wrappedValue = Gemma3nRMSNorm(dim: config.hiddenSize) + + super.init() + } + + func callAsFunction(_ audioencodings: MLXArray, _ audioMelMask: MLXArray) -> MLXArray { + var result = ffwLayerStart(audioencodings) + result = attention(result, mask: audioMelMask) + + let validityMaskForLconv = .!audioMelMask + let audioencodingsForLconvInput = + result + * expandedDimensions( + validityMaskForLconv, axis: -1 + ).asType(result.dtype) + + result = lconv1d(audioencodingsForLconvInput) + result = ffwLayerEnd(result) + result = clip(result, min: -gradientClipping, max: gradientClipping) + return norm(result) + } +} + +// MARK: - MobileNetV5 Architecture Components + +// MARK: - Layer Scale 2D +private class LayerScale2d: Module, UnaryLayer { + let inplace: Bool + @ModuleInfo var gamma: MLXArray + + init(dim: Int, initValues: Float = 1e-5, inplace: Bool = false) { + self.inplace = inplace + self._gamma.wrappedValue = MLXArray(initValues) * MLXArray.ones([dim]) + super.init() + } + + func callAsFunction(_ x: MLXArray) -> MLXArray { + if inplace { + return x * gamma + } else { + return x * gamma + } + } +} + +// MARK: - RMS Norm 2D for Vision +private func rmsNorm2d( + _ x: MLXArray, + normalizedShape: [Int], + weight: MLXArray? 
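+// LayerScale2d above multiplies each channel by a learned gamma initialized to
+// initValues (default 1e-5), so residual branches start nearly muted and open up
+// during training; with NHWC inputs, x * gamma broadcasts over the trailing C
+// axis. The inplace flag is kept only for torch API parity: MLX arrays are
+// immutable, so both branches compute the same thing.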
= nil, + eps: Float = 1e-5 +) -> MLXArray { + assert(normalizedShape.count == 1) + let dtype = x.dtype + let v = pow(x, 2) + let vMean = mean(v, axis: 1, keepDims: true) + var result = x * rsqrt(vMean + eps) + + if let weight = weight { + let weightReshaped = weight.reshaped([1, -1, 1, 1]) + result = result.asType(dtype) * weightReshaped + } + return result +} + +private class RMSNormAct2d: Module, UnaryLayer { + let normalizedShape: [Int] + let eps: Float + let applyAct: Bool + @ModuleInfo var weight: MLXArray + @ModuleInfo var drop: Identity + @ModuleInfo var act: UnaryLayer + + init(numChannels: Int, eps: Float = 1e-6, applyAct: Bool = true) { + self.normalizedShape = [numChannels] + self.eps = eps + self.applyAct = applyAct + + self._weight.wrappedValue = MLXArray.ones([numChannels]) + self._drop.wrappedValue = Identity() + self._act.wrappedValue = applyAct ? GELU() : Identity() + + super.init() + } + + func callAsFunction(_ x: MLXArray) -> MLXArray { + // Convert from NHWC to NCHW for RMS norm + let xNCHW = x.transposed(0, 3, 1, 2) + var result = rmsNorm2d(xNCHW, normalizedShape: normalizedShape, weight: weight, eps: eps) + result = drop(result) + result = act(result) + // Convert back to NHWC + return result.transposed(0, 2, 3, 1) + } +} + +// MARK: - Helper Functions +private func numGroups(groupSize: Int?, channels: Int) -> Int { + guard let groupSize = groupSize, groupSize > 0 else { + return 1 // normal conv with 1 group + } + // NOTE: groupSize == 1 -> depthwise conv + assert(channels % groupSize == 0) + return channels / groupSize +} + +private func makeDivisible( + _ v: Int, divisor: Int = 8, minValue: Int? = nil, roundLimit: Float = 0.9 +) -> Int { + let minVal = minValue ?? divisor + let newV = max(minVal, (v + divisor / 2) / divisor * divisor) + // Make sure that round down does not go down by more than 10% + if Float(newV) < roundLimit * Float(v) { + return newV + divisor + } + return newV +} + +private func to2Tuple(_ x: Any) -> (Int, Int) { + if let tuple = x as? (Int, Int) { + return tuple + } else if let int = x as? 
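+// Worked examples for makeDivisible above: 67 -> (67 + 4) / 8 * 8 = 64 (kept since
+// 64 >= 0.9 * 67); 57 -> 56; 100 -> 104. And numGroups(groupSize: 1, channels: c)
+// returns c groups, i.e. a depthwise convolution.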
Int { + return (int, int) + } else { + fatalError("Cannot convert to 2-tuple") + } +} + +// MARK: - Conv Norm Act +private class ConvNormAct: Module, UnaryLayer { + let outChannels: Int + @ModuleInfo var conv: Conv2d + @ModuleInfo var bn: RMSNormAct2d + + init( + inChannels: Int, + outChannels: Int, + kernelSize: Int = 3, + stride: Int = 1, + padding: Int = 0, + dilation: Int = 1, + groups: Int = 1, + bias: Bool = false, + applyAct: Bool = true, + eps: Float = 1e-6 + ) { + self.outChannels = outChannels + + self._conv.wrappedValue = Conv2d( + inputChannels: inChannels, + outputChannels: outChannels, + kernelSize: IntOrPair(kernelSize), + stride: IntOrPair(stride), + padding: IntOrPair(padding), + dilation: IntOrPair(dilation), + groups: groups, + bias: bias + ) + + self._bn.wrappedValue = RMSNormAct2d( + numChannels: outChannels, + eps: eps, + applyAct: applyAct + ) + + super.init() + } + + func callAsFunction(_ x: MLXArray) -> MLXArray { + let c = conv(x) + let r = bn(c) + return r + } +} + +// MARK: - Universal Inverted Residual +private class UniversalInvertedResidual: Module, UnaryLayer { + let hasSkip: Bool + @ModuleInfo var dwStart: UnaryLayer + @ModuleInfo var pwExp: ConvNormAct + @ModuleInfo var dwMid: UnaryLayer + @ModuleInfo var pwProj: ConvNormAct + @ModuleInfo var layerScale: UnaryLayer + + init( + inChannels: Int, + outChannels: Int, + dwKernelSizeStart: Int = 0, + dwKernelSizeMid: Int = 3, + dwKernelSizeEnd: Int = 0, + stride: Int = 1, + dilation: Int = 1, + groupSize: Int = 1, + padType: String = "", + noskip: Bool = false, + expRatio: Float = 1.0, + convKwargs: [String: Any]? = nil, + dropPathRate: Float = 0.0, + layerScaleInitValue: Float? = 1e-5 + ) { + self.hasSkip = (inChannels == outChannels && stride == 1) && !noskip + + if stride > 1 { + assert(dwKernelSizeStart > 0 || dwKernelSizeMid > 0 || dwKernelSizeEnd > 0) + } + + // DW Start + if dwKernelSizeStart > 0 { + let dwStartStride = dwKernelSizeMid > 0 ? 
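+            // e.g. inChannels = 256, expRatio = 6.0 -> midChannels =
+            // makeDivisible(1536) = 1536; the block then runs dwStart -> 1x1 expand
+            // -> dwMid -> 1x1 project, adding the skip only when shapes match.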
1 : stride + let dwStartGroups = numGroups(groupSize: groupSize, channels: inChannels) + self._dwStart.wrappedValue = ConvNormAct( + inChannels: inChannels, + outChannels: inChannels, + kernelSize: dwKernelSizeStart, + stride: dwStartStride, + padding: (dwKernelSizeStart - 1) / 2, + dilation: dilation, + groups: dwStartGroups, + bias: false, + applyAct: false, + eps: 1e-05 + ) + } else { + self._dwStart.wrappedValue = Identity() + } + + // PW Expansion + let midChannels = makeDivisible(Int(Float(inChannels) * expRatio)) + self._pwExp.wrappedValue = ConvNormAct( + inChannels: inChannels, + outChannels: midChannels, + kernelSize: 1, + stride: 1, + padding: 0, + groups: 1, + bias: false, + eps: 1e-05 + ) + + // DW Mid + if dwKernelSizeMid > 0 { + let dwMidGroups = numGroups(groupSize: groupSize, channels: midChannels) + self._dwMid.wrappedValue = ConvNormAct( + inChannels: midChannels, + outChannels: midChannels, + kernelSize: dwKernelSizeMid, + stride: stride, + padding: (dwKernelSizeMid - 1) / 2, + dilation: dilation, + groups: dwMidGroups, + bias: false, + eps: 1e-05 + ) + } else { + self._dwMid.wrappedValue = Identity() + } + + // PW Projection + self._pwProj.wrappedValue = ConvNormAct( + inChannels: midChannels, + outChannels: outChannels, + kernelSize: 1, + stride: 1, + padding: 0, + groups: 1, + bias: false, + applyAct: false, + eps: 1e-05 + ) + + // Layer Scale + if let layerScaleInitValue = layerScaleInitValue { + self._layerScale.wrappedValue = LayerScale2d( + dim: outChannels, initValues: layerScaleInitValue) + } else { + self._layerScale.wrappedValue = Identity() + } + + super.init() + } + + func callAsFunction(_ x: MLXArray) -> MLXArray { + let shortcut = x + var result = dwStart(x) + result = pwExp(result) + result = dwMid(result) + result = pwProj(result) + result = layerScale(result) + + if hasSkip { + result = result + shortcut + } + return result + } +} + +// MARK: - Edge Residual +private class EdgeResidual: Module, UnaryLayer { + let hasSkip: Bool + @ModuleInfo var convExp: Conv2d + @ModuleInfo var bn1: RMSNormAct2d + @ModuleInfo var convPwl: Conv2d + @ModuleInfo var bn2: RMSNormAct2d + + init( + inChannels: Int, + outChannels: Int, + expKernelSize: Int = 3, + stride: Int = 1, + dilation: Int = 1, + groupSize: Int = 0, + padType: String = "", + forceInChannels: Int = 0, + noskip: Bool = false, + expandRatio: Float = 1.0, + pwKernelSize: Int = 1, + normLayer: RMSNormAct2d.Type = RMSNormAct2d.self + ) { + let midChannels: Int + if forceInChannels > 0 { + midChannels = makeDivisible(Int(Float(forceInChannels) * expandRatio)) + } else { + midChannels = makeDivisible(Int(Float(inChannels) * expandRatio)) + } + + let groups = numGroups(groupSize: groupSize, channels: midChannels) + self.hasSkip = (inChannels == outChannels && stride == 1) && !noskip + + let padding = (expKernelSize - 1) / 2 + self._convExp.wrappedValue = Conv2d( + inputChannels: inChannels, + outputChannels: midChannels, + kernelSize: IntOrPair(expKernelSize), + stride: IntOrPair(stride), + padding: IntOrPair(padding), + dilation: IntOrPair(dilation), + groups: groups, + bias: false + ) + + self._bn1.wrappedValue = RMSNormAct2d(numChannels: midChannels, eps: 1e-05) + + // Point-wise linear projection + let paddingPwl = (pwKernelSize - 1) / 2 + self._convPwl.wrappedValue = Conv2d( + inputChannels: midChannels, + outputChannels: outChannels, + kernelSize: IntOrPair(pwKernelSize), + stride: IntOrPair(1), + padding: IntOrPair(paddingPwl), + bias: false + ) + + self._bn2.wrappedValue = RMSNormAct2d( + numChannels: 
outChannels, + eps: 1e-05, + applyAct: false + ) + + super.init() + } + + func callAsFunction(_ x: MLXArray) -> MLXArray { + let shortcut = x + var result = convExp(x) + result = bn1(result) + result = convPwl(result) + result = bn2(result) + + if hasSkip { + result = result + shortcut + } + return result + } +} + +// MARK: - Multi-Query Attention 2D +private class MultiQueryAttention2d: Module { + let numHeads: Int + let queryStrides: (Int, Int) + let kvStride: Int + let fusedAttn: Bool + let keyDim: Int + let valueDim: Int + let scale: Float + + @ModuleInfo var queryProj: Conv2d + @ModuleInfo var keyDownConv: Conv2d? + @ModuleInfo var keyNorm: RMSNormAct2d? + @ModuleInfo var keyProj: Conv2d + @ModuleInfo var valueDownConv: Conv2d? + @ModuleInfo var valueNorm: RMSNormAct2d? + @ModuleInfo var valueProj: Conv2d + @ModuleInfo var attnDrop: UnaryLayer + @ModuleInfo var outputProj: Conv2d + @ModuleInfo var projDrop: UnaryLayer + + init( + dim: Int, + dimOut: Int? = nil, + numHeads: Int = 8, + keyDim: Int = 64, + valueDim: Int = 64, + queryStrides: (Int, Int) = (1, 1), + kvStride: Int = 1, + dilation: Int = 1, + padding: String = "", + dwKernelSize: Int = 3, + attnDrop: Float = 0.0, + projDrop: Float = 0.0 + ) { + let dimOut = dimOut ?? dim + self.numHeads = numHeads + self.queryStrides = queryStrides + self.kvStride = kvStride + self.fusedAttn = true + self.keyDim = keyDim + self.valueDim = valueDim + let headDim = keyDim + self.scale = pow(Float(headDim), -0.5) + + // Query + self._queryProj.wrappedValue = Conv2d( + inputChannels: dim, + outputChannels: numHeads * keyDim, + kernelSize: IntOrPair(1) + ) + + // Key + if kvStride > 1 { + self._keyDownConv.wrappedValue = Conv2d( + inputChannels: dim, + outputChannels: dim, + kernelSize: IntOrPair(dwKernelSize), + stride: IntOrPair(kvStride), + padding: IntOrPair((dwKernelSize - 1) / 2), + dilation: IntOrPair(dilation), + groups: dim, + bias: false + ) + self._keyNorm.wrappedValue = RMSNormAct2d(numChannels: dim, eps: 1e-6, applyAct: false) + } else { + self._keyDownConv.wrappedValue = nil + self._keyNorm.wrappedValue = nil + } + self._keyProj.wrappedValue = Conv2d( + inputChannels: dim, + outputChannels: keyDim, + kernelSize: IntOrPair(1), + bias: false + ) + + // Value + if kvStride > 1 { + self._valueDownConv.wrappedValue = Conv2d( + inputChannels: dim, + outputChannels: dim, + kernelSize: IntOrPair(dwKernelSize), + stride: IntOrPair(kvStride), + padding: IntOrPair((dwKernelSize - 1) / 2), + dilation: IntOrPair(dilation), + groups: dim, + bias: false + ) + self._valueNorm.wrappedValue = RMSNormAct2d( + numChannels: dim, eps: 1e-6, applyAct: false) + } else { + self._valueDownConv.wrappedValue = nil + self._valueNorm.wrappedValue = nil + } + self._valueProj.wrappedValue = Conv2d( + inputChannels: dim, + outputChannels: valueDim, + kernelSize: IntOrPair(1), + bias: false + ) + + // Attention dropout + self._attnDrop.wrappedValue = attnDrop > 0 ? Dropout(p: attnDrop) : Identity() + + // Output projection + self._outputProj.wrappedValue = Conv2d( + inputChannels: valueDim * numHeads, + outputChannels: dimOut, + kernelSize: IntOrPair(1), + stride: IntOrPair(1), + bias: false + ) + + self._projDrop.wrappedValue = projDrop > 0 ? 
Dropout(p: projDrop) : Identity() + + super.init() + } + + private func reshapeInput(_ t: MLXArray) -> MLXArray { + // Input shape MLX: [B, H, W, C] + // MLX Reshape: [B, H, W, C] -> [B, -1, C] -> [B, 1, -1, C] -> SDPA + let s = t.shape + let reshaped = t.reshaped([s[0], -1, s[3]]) + return expandedDimensions(reshaped, axis: 1) + } + + private func reshapeProjectedQuery(_ t: MLXArray, numHeads: Int, keyDim: Int) -> MLXArray { + // Input shape MLX: [B, H, W, C] where C = numHeads * keyDim + let (B, H, W, C) = (t.shape[0], t.shape[1], t.shape[2], t.shape[3]) + let reshaped = t.reshaped([B, H * W, numHeads, keyDim]) + return reshaped.transposed(0, 2, 1, 3) + } + + private func reshapeOutput(_ t: MLXArray, numHeads: Int, hPx: Int, wPx: Int) -> MLXArray { + // Input shape: [B, NH, L, D] where L = hPx * wPx + // Output shape MLX: [B, H, W, C] where C = NH * D + let (B, NH, L, D) = (t.shape[0], t.shape[1], t.shape[2], t.shape[3]) + // First transpose to [B, L, NH, D] + let transposed = t.transposed(0, 2, 1, 3) + // Then reshape to [B, H, W, NH*D] + return transposed.reshaped([B, hPx, wPx, NH * D]) + } + + func callAsFunction(_ x: MLXArray, attnMask: MLXArray? = nil) -> MLXArray { + let (B, H, W, C) = (x.shape[0], x.shape[1], x.shape[2], x.shape[3]) + + let q = queryProj(x) + let qReshaped = reshapeProjectedQuery(q, numHeads: numHeads, keyDim: keyDim) + + var k = x + if let keyDownConv = keyDownConv { + k = keyDownConv(k) + } + if let keyNorm = keyNorm { + k = keyNorm(k) + } + k = keyProj(k) + let kReshaped = reshapeInput(k) + + var v = x + if let valueDownConv = valueDownConv { + v = valueDownConv(v) + } + if let valueNorm = valueNorm { + v = valueNorm(v) + } + v = valueProj(v) + let vReshaped = reshapeInput(v) + + let o: MLXArray + if fusedAttn { + o = MLXFast.scaledDotProductAttention( + queries: qReshaped, + keys: kReshaped, + values: vReshaped, + scale: scale, + mask: .none + ) + } else { + fatalError("Unfused attention not implemented") + } + + let oReshaped = reshapeOutput( + o, + numHeads: numHeads, + hPx: H / queryStrides.0, + wPx: W / queryStrides.1 + ) + + return outputProj(oReshaped) + } +} + +// MARK: - Mobile Attention +private class MobileAttention: Module, UnaryLayer { + let hasSkip: Bool + let queryStrides: (Int, Int) + let kvStride: Int + let hasQueryStride: Bool + + @ModuleInfo var norm: RMSNormAct2d + @ModuleInfo var attn: MultiQueryAttention2d + @ModuleInfo var layerScale: UnaryLayer + @ModuleInfo var dropPath: Identity + + init( + inChannels: Int, + outChannels: Int, + stride: Int = 1, + dwKernelSize: Int = 3, + dilation: Int = 1, + groupSize: Int = 1, + padType: String = "", + numHeads: Int = 8, + keyDim: Int = 64, + valueDim: Int = 64, + useMultiQuery: Bool = true, + queryStrides: (Int, Int) = (1, 1), + kvStride: Int = 1, + cpeDwKernelSize: Int = 3, + noskip: Bool = false, + actLayer: Module? = nil, + aaLayer: Module? = nil, + dropPathRate: Float = 0.0, + attnDrop: Float = 0.0, + projDrop: Float = 0.0, + layerScaleInitValue: Float? 
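+        // Multi-query layout in the attention above: queries are reshaped to
+        // [B, numHeads, H*W, keyDim] while a single shared key/value of shape
+        // [B, 1, (H/kvStride)*(W/kvStride), keyDim] is broadcast across all heads
+        // inside scaledDotProductAttention.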
= 1e-5, + useBias: Bool = false + ) { + self.hasSkip = (stride == 1 && inChannels == outChannels) && !noskip + self.queryStrides = queryStrides + self.kvStride = kvStride + self.hasQueryStride = queryStrides.0 > 1 || queryStrides.1 > 1 + + // Normalization layer + self._norm.wrappedValue = RMSNormAct2d( + numChannels: inChannels, + eps: 1e-05, + applyAct: false + ) + + // Attention layer + if useMultiQuery { + self._attn.wrappedValue = MultiQueryAttention2d( + dim: inChannels, + dimOut: outChannels, + numHeads: numHeads, + keyDim: keyDim, + valueDim: valueDim, + queryStrides: queryStrides, + kvStride: kvStride, + dilation: dilation, + padding: padType, + dwKernelSize: dwKernelSize, + attnDrop: attnDrop, + projDrop: projDrop + ) + } else { + fatalError("Attention not implemented") + } + + // Layer scaling + if let layerScaleInitValue = layerScaleInitValue { + self._layerScale.wrappedValue = LayerScale2d( + dim: outChannels, initValues: layerScaleInitValue) + } else { + self._layerScale.wrappedValue = Identity() + } + + // Drop path for residual connection + self._dropPath.wrappedValue = Identity() + + super.init() + } + + func callAsFunction(_ x: MLXArray) -> MLXArray { + let shortcut = x + var result = norm(x) + result = attn(result) + result = layerScale(result) + + // Apply skip connection if available + if hasSkip { + result = dropPath(result) + shortcut + } + + return result + } +} + +// MARK: - Configuration Classes +private struct EdgeResidualConfig { + let kernelSize: Int + let filters: Int + let strides: Int + let expandRatio: Float + let isMultiscale: Bool + + init( + kernelSize: Int = 3, filters: Int = 32, strides: Int = 1, expandRatio: Float = 4.0, + isMultiscale: Bool = false + ) { + self.kernelSize = kernelSize + self.filters = filters + self.strides = strides + self.expandRatio = expandRatio + self.isMultiscale = isMultiscale + } +} + +private func _er( + _ kernelSize: Int, _ filters: Int, _ strides: Int = 1, _ expandRatio: Float = 4.0, + _ isMultiscale: Bool = false +) -> EdgeResidualConfig { + return EdgeResidualConfig( + kernelSize: kernelSize, filters: filters, strides: strides, expandRatio: expandRatio, + isMultiscale: isMultiscale) +} + +private struct UniversalInvertedResidualConfig { + let startDwKernelSize: Int + let midDwKernelSize: Int + let filters: Int + let strides: Int + let expandRatio: Float + let isMultiscale: Bool + + init( + startDwKernelSize: Int, midDwKernelSize: Int, filters: Int, strides: Int = 1, + expandRatio: Float = 4.0, isMultiscale: Bool = false + ) { + self.startDwKernelSize = startDwKernelSize + self.midDwKernelSize = midDwKernelSize + self.filters = filters + self.strides = strides + self.expandRatio = expandRatio + self.isMultiscale = isMultiscale + } +} + +private func _uir( + _ startDwKernelSize: Int, _ midDwKernelSize: Int, _ filters: Int, _ strides: Int = 1, + _ expandRatio: Float = 4.0, _ isMultiscale: Bool = false +) -> UniversalInvertedResidualConfig { + return UniversalInvertedResidualConfig( + startDwKernelSize: startDwKernelSize, + midDwKernelSize: midDwKernelSize, + filters: filters, + strides: strides, + expandRatio: expandRatio, + isMultiscale: isMultiscale + ) +} + +private struct MultiQueryAttentionBlockConfig { + let numHeads: Int + let kvDim: Int + let kvStrides: Int + let mmqaAvgPoolKv: Bool + let isMultiscale: Bool + + init( + numHeads: Int = 8, kvDim: Int = 16, kvStrides: Int = 1, mmqaAvgPoolKv: Bool = false, + isMultiscale: Bool = false + ) { + self.numHeads = numHeads + self.kvDim = kvDim + self.kvStrides = kvStrides + 
self.mmqaAvgPoolKv = mmqaAvgPoolKv + self.isMultiscale = isMultiscale + } +} + +private func _mmqa( + _ numHeads: Int, _ kvDim: Int, _ kvStrides: Int, _ mmqaAvgPoolKv: Bool = false, + _ isMultiscale: Bool = false +) -> MultiQueryAttentionBlockConfig { + return MultiQueryAttentionBlockConfig( + numHeads: numHeads, + kvDim: kvDim, + kvStrides: kvStrides, + mmqaAvgPoolKv: mmqaAvgPoolKv, + isMultiscale: isMultiscale + ) +} + +// MARK: - MobileNet Definition +private func gemma3nMobilenetDef() -> [[Any]] { + return [ + // Stage 1: Edge Residuals + [_er(3, 128, 2)] + Array(repeating: _er(3, 128, 1), count: 2), + // Stage 2: Universal Inverted Residuals + [_uir(3, 5, 256, 2, 6.0)] + [5, 3, 5, 3].map { _uir($0, 0, 256) }, + // Stage 3: Universal Inverted Residuals with Multi-Query Attention + [_uir(5, 5, 640, 2, 6.0)] + + Array(repeating: _uir(5, 0, 640), count: 7) + + [_uir(0, 0, 640, 1, 1.0)] + + Array(repeating: [_mmqa(12, 64, 2), _uir(0, 0, 640, 1, 2.0)], count: 13).flatMap { + $0 + } + + [_mmqa(12, 64, 2), _uir(0, 0, 640, 1, 2.0, true)], + // Stage 4: Universal Inverted Residuals with Multi-Query Attention + [_uir(5, 5, 1280, 2, 6.0)] + + Array(repeating: [_mmqa(16, 96, 1), _uir(0, 0, 1280, 1, 2.0)], count: 18).flatMap { + $0 + } + + [_mmqa(16, 96, 1), _uir(0, 0, 1280, 1, 2.0, true)], + ] +} + +// MARK: - Multi-Scale Fusion Adapter +private class MobileNetV5MultiScaleFusionAdapter: Module { + let inChannels: Int + let outChannels: Int + let outputResolution: (Int, Int) + let expansionRatio: Float + let interpolationMode: String + let useLayerScale: Bool + let layerScaleInitValue: Float + let noskip: Bool + + @ModuleInfo var ffn: UniversalInvertedResidual + @ModuleInfo var norm: RMSNormAct2d + @ModuleInfo var avgPool: AvgPool2d + + init( + inChannels: [Int], + outChannels: Int, + outputResolution: Int, + expansionRatio: Float = 2.0, + interpolationMode: String = "nearest", + useLayerScale: Bool = false, + layerScaleInitValue: Float = 1e-5, + noskip: Bool = true + ) { + let inChannelsSum = inChannels.reduce(0, +) + self.inChannels = inChannelsSum + self.outChannels = outChannels + self.outputResolution = to2Tuple(outputResolution) + self.expansionRatio = expansionRatio + self.interpolationMode = interpolationMode + self.useLayerScale = useLayerScale + self.layerScaleInitValue = layerScaleInitValue + self.noskip = noskip + + self._ffn.wrappedValue = UniversalInvertedResidual( + inChannels: inChannelsSum, + outChannels: outChannels, + dwKernelSizeStart: 0, + dwKernelSizeMid: 0, + noskip: noskip, + expRatio: expansionRatio, + layerScaleInitValue: useLayerScale ? 
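+            // gemma3nMobilenetDef() above expands to four stages (128 -> 256 -> 640
+            // -> 1280 channels); stages 3 and 4 interleave _mmqa attention blocks
+            // with _uir(0, 0, c, 1, 2.0) blocks, and their final outputs feed the
+            // multi-scale fusion adapter below (640 + 1280 = 1920 input channels).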
layerScaleInitValue : nil + ) + + self._norm.wrappedValue = RMSNormAct2d(numChannels: outChannels, eps: 1e-6, applyAct: false) + + // For simplicity, using AvgPool2d for downsampling + self._avgPool.wrappedValue = AvgPool2d(kernelSize: IntOrPair(2), stride: IntOrPair(2)) + + super.init() + } + + func callAsFunction(_ inputs: [MLXArray]) -> MLXArray { + // Convert from NHWC to NCHW for processing + let inputsNCHW = inputs.map { $0.transposed(0, 3, 1, 2) } + + // Find the highest resolution (first input) + let highResolution = inputsNCHW[0].shape.suffix(2) + var resizedInputs: [MLXArray] = [] + + for img in inputsNCHW { + let imgShape = img.shape.suffix(2) + var resizedImg = img + + // Resize if needed using nearest neighbor interpolation + if imgShape[0] < highResolution[0] || imgShape[1] < highResolution[1] { + // Simple nearest neighbor interpolation + let scaleH = Float(highResolution[0]) / Float(imgShape[0]) + let scaleW = Float(highResolution[1]) / Float(imgShape[1]) + // For simplicity, just repeat the image - in practice you'd implement proper interpolation + resizedImg = img + } + + resizedInputs.append(resizedImg) + } + + // Concatenate on channel dimension + let channelCatImgs = concatenated(resizedInputs, axis: 1) + + // Convert back to NHWC for MLX processing + let channelCatImgsNHWC = channelCatImgs.transposed(0, 2, 3, 1) + var img = ffn(channelCatImgsNHWC) + + // Handle output resolution adjustment + let currentResolution = (img.shape[1], img.shape[2]) + if currentResolution.0 != outputResolution.0 || currentResolution.1 != outputResolution.1 { + if currentResolution.0 % outputResolution.0 != 0 + || currentResolution.1 % outputResolution.1 != 0 + { + // Use bicubic interpolation to match Python implementation + img = bicubicInterpolate(img, to: outputResolution) + } else { + let hStrides = currentResolution.0 / outputResolution.0 + let wStrides = currentResolution.1 / outputResolution.1 + + // Convert to NCHW for AvgPool2d + let imgNCHW = img.transposed(0, 3, 1, 2) + let pooled = AvgPool2d( + kernelSize: IntOrPair(hStrides), + stride: IntOrPair(hStrides) + )(imgNCHW) + img = pooled.transposed(0, 2, 3, 1) + } + + img = noskip ? img : norm(img) + } + + return img + } +} + +// MARK: - Vision Tower +private class VisionTower: Module { + @ModuleInfo var convStem: ConvNormAct + @ModuleInfo var blocks: [[UnaryLayer]] + @ModuleInfo var msfa: MobileNetV5MultiScaleFusionAdapter + + let numFeatures: Int + let headHiddenSize: Int + let msfaIndices: (Int, Int) + let msfaOutputResolution: (Int, Int) + + init(config: VisionConfig) { + self._convStem.wrappedValue = ConvNormAct( + inChannels: 3, + outChannels: 64, + kernelSize: 3, + stride: 2, + padding: 1, + eps: 1e-05 + ) + + self.msfaIndices = (3, 4) + self.msfaOutputResolution = (16, 16) + + let (numFeatures, blocks) = Self.buildBlocks(convStemOutChannels: 64) + self.numFeatures = numFeatures + self.headHiddenSize = numFeatures + self._blocks.wrappedValue = blocks + + self._msfa.wrappedValue = MobileNetV5MultiScaleFusionAdapter( + inChannels: [1920], + outChannels: 2048, + outputResolution: msfaOutputResolution.0 + ) + + super.init() + } + + static func buildBlocks(convStemOutChannels: Int) -> (Int, [[UnaryLayer]]) { + var blocks: [[UnaryLayer]] = [] + var inChannels = convStemOutChannels + + for (stage, blockConfigs) in gemma3nMobilenetDef().enumerated() { + var blockGroup: [UnaryLayer] = [] + + for config in blockConfigs { + if let edgeConfig = config as? 
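+                // The resize path in the fusion adapter above is currently a
+                // pass-through stub. A hypothetical integer-factor nearest-neighbor
+                // upsample (not part of this patch) could look like:
+                //   func nearestUpsample(_ x: MLXArray, by s: Int) -> MLXArray {
+                //       // NCHW: repeat each row, then each column, s times
+                //       repeated(repeated(x, count: s, axis: 2), count: s, axis: 3)
+                //   }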
EdgeResidualConfig { + let block = EdgeResidual( + inChannels: inChannels, + outChannels: edgeConfig.filters, + expKernelSize: edgeConfig.kernelSize, + stride: edgeConfig.strides, + expandRatio: edgeConfig.expandRatio + ) + inChannels = edgeConfig.filters + blockGroup.append(block) + } else if let uirConfig = config as? UniversalInvertedResidualConfig { + let block = UniversalInvertedResidual( + inChannels: inChannels, + outChannels: uirConfig.filters, + dwKernelSizeStart: uirConfig.startDwKernelSize, + dwKernelSizeMid: uirConfig.midDwKernelSize, + stride: uirConfig.strides, + expRatio: uirConfig.expandRatio + ) + inChannels = uirConfig.filters + blockGroup.append(block) + } else if let attentionConfig = config as? MultiQueryAttentionBlockConfig { + let block = MobileAttention( + inChannels: inChannels, + outChannels: inChannels, + stride: 1, + numHeads: attentionConfig.numHeads, + keyDim: attentionConfig.kvDim, + valueDim: attentionConfig.kvDim, + kvStride: attentionConfig.kvStrides, + actLayer: nil + ) + blockGroup.append(block) + } + } + blocks.append(blockGroup) + } + + return (inChannels, blocks) + } + + func callAsFunction( + _ x: MLXArray, + outputHiddenStates: Bool = false + ) -> MLXArray { + var featIdx = 0 + // Convert from NCHW to NHWC + var result = x.transposed(0, 2, 3, 1) + result = convStem(result) + var intermediates: [MLXArray] = [] + + if msfaIndices.0 == featIdx || msfaIndices.1 == featIdx { + intermediates.append(result) + } + + // MBV5 is constructed of 4 stages, each stage is a group of blocks + for blockGroup in blocks { + featIdx += 1 + for block in blockGroup { + result = block(result) + } + + if msfaIndices.0 == featIdx || msfaIndices.1 == featIdx { + intermediates.append(result) + } + } + + result = msfa(intermediates) + return result + } +} + +// MARK: - Complete Vision Model +private class Gemma3nVisionModel: Module { + let modelType: String + @ModuleInfo var timmModel: VisionTower + + init(config: VisionConfig) { + self.modelType = config.modelType + self._timmModel.wrappedValue = VisionTower(config: config) + super.init() + } + + func callAsFunction( + _ x: MLXArray, + outputHiddenStates: Bool = false + ) -> MLXArray { + return timmModel(x, outputHiddenStates: outputHiddenStates) + } + + func sanitize(weights: [String: MLXArray]) -> [String: MLXArray] { + var sanitizedWeights = [String: MLXArray]() + var skipTranspose = false + + // Check if weights are already in MLX format + if let convWeight = weights["vision_tower.timm_model.blocks.0.0.conv_exp.weight"] { + let (_, H, _, C) = ( + convWeight.shape[0], convWeight.shape[1], convWeight.shape[2], convWeight.shape[3] + ) + if C > H { + skipTranspose = true + } + } + + for (k, v) in weights { + // PyTorch conv2d weight: [out_channels, in_channels, kH, kW] + // MLX conv2d weight: [out_channels, kH, KW, in_channels] + if (k.contains("conv") && k.contains("weight")) + || (k.contains("attn") && k.contains("proj.weight")) + { + if v.shape.count == 4 && !skipTranspose { + sanitizedWeights[k] = v.transposed(0, 2, 3, 1) + } else { + sanitizedWeights[k] = v + } + } else { + sanitizedWeights[k] = v + } + } + + return sanitizedWeights + } +} + +// MARK: - Complete Audio Model +private class Gemma3nAudioModel: Module { + let config: AudioConfig + + @ModuleInfo var subsampleConvProjection: Gemma3nAudioSubSampleConvProjection + @ModuleInfo var conformer: [Gemma3nAudioConformerBlock] + + init(config: AudioConfig) { + self.config = config + + self._subsampleConvProjection.wrappedValue = Gemma3nAudioSubSampleConvProjection( + 
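+        // Checkpoint layout note for the vision sanitize above: PyTorch Conv2d
+        // weights are [O, I, kH, kW] while MLX expects [O, kH, kW, I], hence the
+        // (0, 2, 3, 1) transpose; skipTranspose detects checkpoints already stored
+        // in MLX layout by comparing channel and spatial extents.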
config: config) + self._conformer.wrappedValue = (0 ..< config.confNumHiddenLayers).map { _ in + Gemma3nAudioConformerBlock(config: config) + } + + super.init() + } + + func callAsFunction( + _ audioMel: MLXArray, + _ audioMelMask: MLXArray + ) -> (MLXArray, MLXArray) { + var audioencodings = subsampleConvProjection(audioMel) + + // Subsample the input audio_mel_mask to match the time dimension + let tSub = audioencodings.shape[1] + + var timeStrideProduct = 1 + for stridePairIdx in 0 ..< config.sscpConvStrideSize.count { + timeStrideProduct *= config.sscpConvStrideSize[stridePairIdx][0] + } + + let indices = MLXArray(0 ..< tSub) * timeStrideProduct + let clippedIndices = clip(indices, max: MLXArray(audioMelMask.shape[1] - 1)) + + var currentMask: MLXArray + if audioMelMask.ndim > 1 && clippedIndices.ndim == 1 { + let expandedIndices = expandedDimensions(clippedIndices, axis: 0) + let broadcastIndices = broadcast( + expandedIndices, + to: [audioMelMask.shape[0], clippedIndices.shape[0]] + ) + currentMask = take(audioMelMask, broadcastIndices.asType(.int32), axis: 1) + } else { + currentMask = take(audioMelMask, clippedIndices.asType(.int32), axis: 1) + } + + // Adjust mask length if needed + if currentMask.shape[1] != tSub { + if currentMask.shape[1] > tSub { + currentMask = currentMask[0..., .. 1 { + let stride = config.confReductionFactor + audioencodings = audioencodings[0..., 0 ..< audioencodings.shape[1], stride, 0...] + currentMask = currentMask[0..., 0 ..< currentMask.shape[1], stride] + } + + // Final masking + if currentMask.shape[1] != audioencodings.shape[1] { + let targetLen = audioencodings.shape[1] + let maskCurrentLen = currentMask.shape[1] + if targetLen > maskCurrentLen { + let paddingNeeded = targetLen - maskCurrentLen + currentMask = padded( + currentMask, + widths: convertTorchToMLXPadWidth([0, paddingNeeded], Array(currentMask.shape)) + ) + } else if maskCurrentLen > targetLen { + currentMask = currentMask[0..., .. [String: MLXArray] { + var sanitizedWeights = [String: MLXArray]() + + for (k, v) in weights { + if k.contains("conv.weight") { + if checkArrayShape(v) { + sanitizedWeights[k] = v + } else { + sanitizedWeights[k] = v.transposed(0, 2, 3, 1) + } + } else if k.contains("conv1d.weight") { + if checkArrayShape(v) { + sanitizedWeights[k] = v + } else { + sanitizedWeights[k] = v.transposed(0, 2, 1) + } + } else { + sanitizedWeights[k] = v + } + } + + return sanitizedWeights + } +} + +// MARK: - LoRA Support + +extension Gemma3n: LoRAModel { + public func loraLinearLayers() -> LoRALinearLayers { + return languageModel.model.layers.map { layer in + (layer.selfAttn, ["q_proj", "v_proj"]) + } + } +} + +// MARK: - VLM Factory Configuration and Processor + +public struct Gemma3nConfiguration: Codable, Sendable { + public let textConfig: TextConfig + public let visionConfig: VisionConfig + public let audioConfig: AudioConfig + public let modelType: String + public let vocabSize: Int + public let ignoreIndex: Int + public let imageTokenIndex: Int + public let audioTokenId: Int + public let imageTokenId: Int + public let hiddenSize: Int + public let padTokenId: Int + public let visionSoftTokensPerImage: Int + public let audioSoftTokensPerImage: Int + public let eosTokenId: [Int]? + public let quantization: QuantizationConfig? 
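+    // This struct mirrors the HF config.json via the snake_case CodingKeys below,
+    // so a decoding sketch is simply:
+    //   let config = try JSONDecoder().decode(Gemma3nConfiguration.self, from: data)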
+ + public var vocabularySize: Int { vocabSize } + + enum CodingKeys: String, CodingKey { + case textConfig = "text_config" + case visionConfig = "vision_config" + case audioConfig = "audio_config" + case modelType = "model_type" + case vocabSize = "vocab_size" + case ignoreIndex = "ignore_index" + case imageTokenIndex = "image_token_index" + case audioTokenId = "audio_token_id" + case imageTokenId = "image_token_id" + case hiddenSize = "hidden_size" + case padTokenId = "pad_token_id" + case visionSoftTokensPerImage = "vision_soft_tokens_per_image" + case audioSoftTokensPerImage = "audio_soft_tokens_per_image" + case eosTokenId = "eos_token_id" + case quantization + } + + public init(from modelConfig: ModelConfig, quantization: QuantizationConfig? = nil) { + self.textConfig = modelConfig.textConfig + self.visionConfig = modelConfig.visionConfig + self.audioConfig = modelConfig.audioConfig + self.modelType = modelConfig.modelType + self.vocabSize = modelConfig.vocabSize + self.ignoreIndex = modelConfig.ignoreIndex + self.imageTokenIndex = modelConfig.imageTokenIndex + self.audioTokenId = modelConfig.audioTokenId + self.imageTokenId = modelConfig.imageTokenId + self.hiddenSize = modelConfig.hiddenSize + self.padTokenId = modelConfig.padTokenId + self.visionSoftTokensPerImage = modelConfig.visionSoftTokensPerImage + self.audioSoftTokensPerImage = modelConfig.audioSoftTokensPerImage + self.eosTokenId = modelConfig.eosTokenId + self.quantization = quantization + } +} + +public class Gemma3nProcessor: UserInputProcessor { + private let config: Gemma3nProcessorConfiguration + private let tokenizer: any Tokenizer + + public init(_ config: Gemma3nProcessorConfiguration, tokenizer: any Tokenizer) { + self.config = config + self.tokenizer = tokenizer + } + + public func preprocess(images: [CIImage], processing: UserInput.Processing?) throws -> ( + MLXArray, THW + ) { + var userProcessing = processing ?? UserInput.Processing() + let targetSize = CGSize(width: config.imageSize, height: config.imageSize) + userProcessing.resize = targetSize + + let processedImages = try images.map { image in + let processedImage = MediaProcessing.apply(image, processing: userProcessing) + let srgbImage = MediaProcessing.inSRGBToneCurveSpace(processedImage) + let resizedImage = try MediaProcessing.resampleBicubic(srgbImage, to: targetSize) + let normalizedImage = MediaProcessing.normalize( + resizedImage, mean: config.imageMeanTuple, std: config.imageStdTuple) + return MediaProcessing.asMLXArray(normalizedImage) + } + + let pixelValues = concatenated(processedImages) + return (pixelValues, THW(images.count, config.imageSize, config.imageSize)) + } + + public func prepare(input: UserInput) async throws -> LMInput { + // Create structured messages for Gemma3n using LIST_WITH_IMAGE_TYPE_TEXT format + var messages: [[String: Any]] = [] + + if !input.images.isEmpty { + // Add image and text content in the format expected by Gemma3n + let content: [[String: Any]] = [ + ["type": "image"], + ["type": "text", "text": input.prompt.description], + ] + messages.append(["role": "user", "content": content]) + } else { + // Text-only message + messages.append(["role": "user", "content": input.prompt.description]) + } + + var promptTokens = try tokenizer.applyChatTemplate(messages: messages) + + // Process images if any + var processedImage: LMInput.ProcessedImage? 
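+        // For a single image, the structured messages built above look like
+        // (illustrative):
+        //   [["role": "user",
+        //     "content": [["type": "image"],
+        //                 ["type": "text", "text": "Describe this image."]]]]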
+ + if !input.images.isEmpty { + let imagePixelsAndFrames = try input.images.map { + try preprocess(images: [$0.asCIImage()], processing: input.processing) + } + let imagePixelsConcatenated = concatenated(imagePixelsAndFrames.map { $0.0 }) + processedImage = LMInput.ProcessedImage( + pixels: imagePixelsConcatenated, + frames: imagePixelsAndFrames.map { $0.1 } + ) + + // Note: Unlike Gemma3, Gemma3n doesn't expand image tokens in the processor + // The model handles token mapping directly in get_input_embeddings + } + + let promptArray = MLXArray(promptTokens).expandedDimensions(axis: 0) + let mask = ones(like: promptArray).asType(.int8) + return LMInput( + text: .init(tokens: promptArray, mask: mask), + image: processedImage + ) + } +} + +public struct Gemma3nProcessorConfiguration: Codable, Sendable { + public let processorClass: String + public let imageProcessorType: String? + public let doNormalize: Bool + public let doRescale: Bool + public let doResize: Bool + public let imageMean: [CGFloat] + public let imageStd: [CGFloat] + public let visionSoftTokensPerImage: Int + public let resample: Int + public let rescaleFactor: Float + public let size: ImageSize + + // Optional fields that may be present in some configs + public let doConvertRgb: Bool? + public let doPanAndScan: Bool? + + // Token identifiers - use default values that match Python implementation + public var imageTokenId: Int { 262145 } // From Python: image_token_id = 262145 + public var audioTokenId: Int { 262273 } // From Python: audio_token_id = 262273 + + public struct ImageSize: Codable, Sendable { + public let height: Int + public let width: Int + } + + // Computed properties for convenience + public var imageSize: Int { size.height } + + public var imageMeanTuple: (CGFloat, CGFloat, CGFloat) { + (imageMean[0], imageMean[1], imageMean[2]) + } + + public var imageStdTuple: (CGFloat, CGFloat, CGFloat) { + (imageStd[0], imageStd[1], imageStd[2]) + } + + enum CodingKeys: String, CodingKey { + case processorClass = "processor_class" + case imageProcessorType = "image_processor_type" + case doNormalize = "do_normalize" + case doRescale = "do_rescale" + case doResize = "do_resize" + case doConvertRgb = "do_convert_rgb" + case doPanAndScan = "do_pan_and_scan" + case imageMean = "image_mean" + case imageStd = "image_std" + case visionSoftTokensPerImage = "vision_soft_tokens_per_image" + case resample + case rescaleFactor = "rescale_factor" + case size + } +} + +extension Gemma3n { + public convenience init(_ config: Gemma3nConfiguration) { + let modelConfig = ModelConfig( + textConfig: config.textConfig, + visionConfig: config.visionConfig, + audioConfig: config.audioConfig, + modelType: config.modelType, + vocabSize: config.vocabSize, + ignoreIndex: config.ignoreIndex, + imageTokenIndex: config.imageTokenIndex, + audioTokenId: config.audioTokenId, + imageTokenId: config.imageTokenId, + hiddenSize: config.hiddenSize, + padTokenId: config.padTokenId, + visionSoftTokensPerImage: config.visionSoftTokensPerImage, + audioSoftTokensPerImage: config.audioSoftTokensPerImage, + eosTokenId: config.eosTokenId + ) + self.init(modelConfig) + } +} diff --git a/Libraries/MLXVLM/VLMModelFactory.swift b/Libraries/MLXVLM/VLMModelFactory.swift index bf57d21d..d56a9338 100644 --- a/Libraries/MLXVLM/VLMModelFactory.swift +++ b/Libraries/MLXVLM/VLMModelFactory.swift @@ -86,6 +86,7 @@ public class VLMTypeRegistry: ModelTypeRegistry, @unchecked Sendable { "qwen2_5_vl": create(Qwen25VLConfiguration.self, Qwen25VL.init), "idefics3": 
create(Idefics3Configuration.self, Idefics3.init), "gemma3": create(Gemma3Configuration.self, Gemma3.init), + "gemma3n": create(Gemma3nConfiguration.self, Gemma3n.init), "smolvlm": create(SmolVLM2Configuration.self, SmolVLM2.init), ] } @@ -111,6 +112,8 @@ public class VLMProcessorTypeRegistry: ProcessorTypeRegistry, @unchecked Sendabl Idefics3ProcessorConfiguration.self, Idefics3Processor.init), "Gemma3Processor": create( Gemma3ProcessorConfiguration.self, Gemma3Processor.init), + "Gemma3nProcessor": create( + Gemma3nProcessorConfiguration.self, Gemma3nProcessor.init), "SmolVLMProcessor": create( SmolVLMProcessorConfiguration.self, SmolVLMProcessor.init), ] @@ -166,6 +169,18 @@ public class VLMRegistry: AbstractModelRegistry, @unchecked Sendable { extraEOSTokens: ["<end_of_turn>"] ) + static public let gemma3n_E2B_instruct = ModelConfiguration( + id: "mlx-community/gemma-3n-E2B-it-bf16", + defaultPrompt: "Describe this image.", + extraEOSTokens: ["<end_of_turn>"] + ) + + static public let gemma3n_E4B_instruct = ModelConfiguration( + id: "mlx-community/gemma-3n-E4B-it-bf16", + defaultPrompt: "Describe this image.", + extraEOSTokens: ["<end_of_turn>"] + ) + static public let smolvlm = ModelConfiguration( id: "HuggingFaceTB/SmolVLM2-500M-Video-Instruct-mlx", defaultPrompt: @@ -181,6 +196,8 @@ public class VLMRegistry: AbstractModelRegistry, @unchecked Sendable { gemma3_4B_qat_4bit, gemma3_12B_qat_4bit, gemma3_27B_qat_4bit, + gemma3n_E2B_instruct, + gemma3n_E4B_instruct, smolvlm, ] } From 15e6693a30240137f2408e56a91451cc9d19db42 Mon Sep 17 00:00:00 2001 From: Anthony DePasquale Date: Fri, 27 Jun 2025 11:53:29 +0200 Subject: [PATCH 02/19] Test with llm-tool --- Tools/llm-tool/LLMTool.swift | 12 +++--------- .../xcshareddata/xcschemes/llm-tool.xcscheme | 4 ++++ 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/Tools/llm-tool/LLMTool.swift b/Tools/llm-tool/LLMTool.swift index 8e2c3862..4140594f 100644 --- a/Tools/llm-tool/LLMTool.swift +++ b/Tools/llm-tool/LLMTool.swift @@ -302,15 +302,9 @@ struct EvaluateCommand: AsyncParsableCommand { let modelFactory: ModelFactory let defaultModel: ModelConfiguration - // Switch between LLM and VLM based on presence of media - let vlm = !media.image.isEmpty || !media.video.isEmpty - if vlm { - modelFactory = VLMModelFactory.shared - defaultModel = MLXVLM.VLMRegistry.qwen2VL2BInstruct4Bit - } else { - modelFactory = LLMModelFactory.shared - defaultModel = MLXLLM.LLMRegistry.mistral7B4bit - } + // Always use VLM factory and gemma3n_E2B_instruct for testing + modelFactory = VLMModelFactory.shared + defaultModel = MLXVLM.VLMRegistry.gemma3n_E2B_instruct // Load the model let modelContainer = try await memory.start { [args] in diff --git a/mlx-swift-examples.xcodeproj/xcshareddata/xcschemes/llm-tool.xcscheme b/mlx-swift-examples.xcodeproj/xcshareddata/xcschemes/llm-tool.xcscheme index a092bee2..85f1d36d 100644 --- a/mlx-swift-examples.xcodeproj/xcshareddata/xcschemes/llm-tool.xcscheme +++ b/mlx-swift-examples.xcodeproj/xcshareddata/xcschemes/llm-tool.xcscheme @@ -69,6 +69,10 @@ [four added lines of scheme XML and the mail header of patch 03 were lost in extraction] Date: Fri, 27 Jun 2025 12:35:49 +0200 Subject: [PATCH 03/19] Fix configs --- Libraries/MLXVLM/Models/Gemma3n.swift | 573 ++++++++++++++++---------- 1 file changed, 349 insertions(+), 224 deletions(-) diff --git a/Libraries/MLXVLM/Models/Gemma3n.swift b/Libraries/MLXVLM/Models/Gemma3n.swift index 3ef06b1c..5b11240c 100644 --- a/Libraries/MLXVLM/Models/Gemma3n.swift +++ b/Libraries/MLXVLM/Models/Gemma3n.swift @@ -23,76 +23,29 @@ public protocol MultimodalConfig { } public struct AudioConfig: Codable, Sendable,
MultimodalConfig { - public let inputFeatSize: Int - public let hiddenSize: Int - public let confAttentionChunkSize: Int - public let confAttentionContextLeft: Int - public let confAttentionContextRight: Int - public let confAttentionInvalidLogitsValue: Float - public let confAttentionLogitCap: Float - public let confNumAttentionHeads: Int - public let confNumHiddenLayers: Int - public let confConvKernelSize: Int - public let confPositionalBiasSize: Int - public let confReductionFactor: Int - public let confResidualWeight: Float - public let sscpConvChannelSize: [Int] - public let sscpConvGroupNormEps: Float - public let sscpConvKernelSize: [[Int]] - public let sscpConvStrideSize: [[Int]] - public let vocabSize: Int - public let sscpConvEps: Float - public let rmsNormEps: Float - public let gradientClipping: Float - public let vocabOffset: Int - - public init( - inputFeatSize: Int = 80, - hiddenSize: Int = 1536, - confAttentionChunkSize: Int = 12, - confAttentionContextLeft: Int = 13, - confAttentionContextRight: Int = 0, - confAttentionInvalidLogitsValue: Float = -1e9, - confAttentionLogitCap: Float = 50.0, - confNumAttentionHeads: Int = 8, - confNumHiddenLayers: Int = 12, - confConvKernelSize: Int = 5, - confPositionalBiasSize: Int = 256, - confReductionFactor: Int = 4, - confResidualWeight: Float = 0.5, - sscpConvChannelSize: [Int] = [128, 32], - sscpConvGroupNormEps: Float = 1e-3, - sscpConvKernelSize: [[Int]] = [[3, 3], [3, 3]], - sscpConvStrideSize: [[Int]] = [[2, 2], [2, 2]], - vocabSize: Int = 128, - sscpConvEps: Float = 1e-3, - rmsNormEps: Float = 1e-6, - gradientClipping: Float = 10000000000.0, - vocabOffset: Int = 262272 // 262_144 + 128 (text vocab size + vision vocab size) - ) { - self.inputFeatSize = inputFeatSize - self.hiddenSize = hiddenSize - self.confAttentionChunkSize = confAttentionChunkSize - self.confAttentionContextLeft = confAttentionContextLeft - self.confAttentionContextRight = confAttentionContextRight - self.confAttentionInvalidLogitsValue = confAttentionInvalidLogitsValue - self.confAttentionLogitCap = confAttentionLogitCap - self.confNumAttentionHeads = confNumAttentionHeads - self.confNumHiddenLayers = confNumHiddenLayers - self.confConvKernelSize = confConvKernelSize - self.confPositionalBiasSize = confPositionalBiasSize - self.confReductionFactor = confReductionFactor - self.confResidualWeight = confResidualWeight - self.sscpConvChannelSize = sscpConvChannelSize - self.sscpConvGroupNormEps = sscpConvGroupNormEps - self.sscpConvKernelSize = sscpConvKernelSize - self.sscpConvStrideSize = sscpConvStrideSize - self.vocabSize = vocabSize - self.sscpConvEps = sscpConvEps - self.rmsNormEps = rmsNormEps - self.gradientClipping = gradientClipping - self.vocabOffset = vocabOffset - } + // Constants with default values (always present) + public let inputFeatSize: Int = 80 + public let hiddenSize: Int = 1536 + public let confAttentionChunkSize: Int = 12 + public let confAttentionContextLeft: Int = 13 + public let confAttentionContextRight: Int = 0 + public let confAttentionInvalidLogitsValue: Float = -1e9 + public let confAttentionLogitCap: Float = 50.0 + public let confNumAttentionHeads: Int = 8 + public let confNumHiddenLayers: Int = 12 + public let confConvKernelSize: Int = 5 + public let confPositionalBiasSize: Int = 256 + public let confReductionFactor: Int = 4 + public let confResidualWeight: Float = 0.5 + public let sscpConvChannelSize: [Int] = [128, 32] + public let sscpConvGroupNormEps: Float = 1e-3 + public let sscpConvKernelSize: [[Int]] = [[3, 3], [3, 3]] 
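A small sketch (the helper is ours, the offsets are the ones defined in these configs) of how the vocab offsets partition token ids across modalities:

enum Modality { case text, vision, audio }

func modality(ofToken id: Int) -> Modality {
    if id >= 262272 { return .audio }  // AudioConfig.vocabOffset = 262_144 + 128
    if id >= 262144 { return .vision }  // VisionConfig.vocabOffset = text vocab size
    return .text
}

// modality(ofToken: 262145) == .vision  (the image token id used elsewhere in this patch)
// modality(ofToken: 262273) == .audio   (the audio token id)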
+ public let sscpConvStrideSize: [[Int]] = [[2, 2], [2, 2]] + public let vocabSize: Int = 128 + public let sscpConvEps: Float = 1e-3 + public let rmsNormEps: Float = 1e-6 + public let gradientClipping: Float = 10000000000.0 + public let vocabOffset: Int = 262272 // 262_144 + 128 (text vocab size + vision vocab size) enum CodingKeys: String, CodingKey { case inputFeatSize = "input_feat_size" @@ -121,43 +74,18 @@ public struct AudioConfig: Codable, Sendable, MultimodalConfig { } public struct VisionConfig: Codable, Sendable, MultimodalConfig { - public let modelType: String - public let numHiddenLayers: Int - public let hiddenSize: Int - public let intermediateSize: Int - public let numAttentionHeads: Int - public let patchSize: Int - public let imageSize: Int - public let numChannels: Int - public let rmsNormEps: Float - public let vocabSize: Int - public let vocabOffset: Int - - public init( - modelType: String = "gemma3n_vision", - numHiddenLayers: Int = 12, - hiddenSize: Int = 2048, - intermediateSize: Int = 8192, - numAttentionHeads: Int = 16, - patchSize: Int = 16, - imageSize: Int = 224, - numChannels: Int = 3, - rmsNormEps: Float = 1e-6, - vocabSize: Int = 128, - vocabOffset: Int = 262144 - ) { - self.modelType = modelType - self.numHiddenLayers = numHiddenLayers - self.hiddenSize = hiddenSize - self.intermediateSize = intermediateSize - self.numAttentionHeads = numAttentionHeads - self.patchSize = patchSize - self.imageSize = imageSize - self.numChannels = numChannels - self.rmsNormEps = rmsNormEps - self.vocabSize = vocabSize - self.vocabOffset = vocabOffset - } + // Constants with default values (always present) + public let modelType: String = "gemma3n_vision" + public let numHiddenLayers: Int = 12 + public let hiddenSize: Int = 2048 + public let intermediateSize: Int = 8192 + public let numAttentionHeads: Int = 16 + public let patchSize: Int = 16 + public let imageSize: Int = 224 + public let numChannels: Int = 3 + public let rmsNormEps: Float = 1e-6 + public let vocabSize: Int = 128 + public let vocabOffset: Int = 262144 enum CodingKeys: String, CodingKey { case modelType = "model_type" @@ -179,103 +107,281 @@ public struct TextConfig: Codable, Sendable { public let hiddenSize: Int public let numHiddenLayers: Int public let intermediateSize: [Int] - public let numAttentionHeads: Int - public let headDim: Int - public let rmsNormEps: Float - public let vocabSize: Int - public let vocabSizePerLayerInput: Int - public let numKeyValueHeads: Int - public let laurelRank: Int - public let fracSharedLayers: Float - public let altupActiveIdx: Int - public let padTokenId: Int - public let altupNumInputs: Int + private let _numAttentionHeads: Int? + private let _headDim: Int? + private let _rmsNormEps: Float? + private let _vocabSize: Int? + private let _vocabSizePerLayerInput: Int? + private let _numKeyValueHeads: Int? + private let _laurelRank: Int? + private let _fracSharedLayers: Float? + private let _altupActiveIdx: Int? + private let _padTokenId: Int? + private let _altupNumInputs: Int? public let altupCoefClip: Float? - public let altupCorrectScale: Bool - public let hiddenSizePerLayerInput: Int - public let ropeLocalBaseFreq: Float - public let ropeTraditional: Bool - public let ropeTheta: Float - public let queryPreAttnScalar: Float - public let slidingWindow: Int + private let _altupCorrectScale: Bool? + private let _hiddenSizePerLayerInput: Int? + private let _ropeLocalBaseFreq: Float? + private let _ropeTraditional: Bool? + private let _ropeTheta: Float? 
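The refactor here swaps stored properties for optional backing fields plus computed defaults; a self-contained sketch of the pattern (struct and field names are illustrative):

struct SketchTextConfig: Codable {
    private let _headDim: Int?
    // Falls back to the default when config.json omits "head_dim"
    public var headDim: Int { _headDim ?? 256 }

    enum CodingKeys: String, CodingKey {
        case _headDim = "head_dim"
    }
}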
+ private let _queryPreAttnScalar: Float? + private let _slidingWindow: Int? public let ropeScaling: [String: StringOrNumber]? - public let mmTokensPerImage: Int - public let slidingWindowPattern: Int public let activationSparsityPattern: [Float]? - public let finalLogitSoftcapping: Float - public let queryRescaleScalar: Float - public let numKvSharedLayers: Int - public let maxPositionEmbeddings: Int - public let attnLogitSoftcapping: Float - public let layerTypes: [String] + public let layerTypes: [String]? + private let _mmTokensPerImage: Int? + private let _slidingWindowPattern: Int? + private let _finalLogitSoftcapping: Float? + private let _queryRescaleScalar: Float? + private let _numKvSharedLayers: Int? + private let _maxPositionEmbeddings: Int? + private let _attnLogitSoftcapping: Float? + + // Computed properties with defaults + public var numAttentionHeads: Int { + _numAttentionHeads ?? 2 + } + + public var headDim: Int { + _headDim ?? 256 + } + + public var rmsNormEps: Float { + _rmsNormEps ?? 1.0e-6 + } + + public var vocabSize: Int { + _vocabSize ?? 262400 + } + + public var vocabSizePerLayerInput: Int { + _vocabSizePerLayerInput ?? 262144 + } + + public var numKeyValueHeads: Int { + _numKeyValueHeads ?? 4 + } + + public var laurelRank: Int { + _laurelRank ?? 64 + } + + public var fracSharedLayers: Float { + _fracSharedLayers ?? 0.5 + } + + public var altupActiveIdx: Int { + _altupActiveIdx ?? 0 + } + + public var padTokenId: Int { + _padTokenId ?? 0 + } + + public var altupNumInputs: Int { + _altupNumInputs ?? 4 + } + + public var altupCorrectScale: Bool { + _altupCorrectScale ?? true + } + + public var hiddenSizePerLayerInput: Int { + _hiddenSizePerLayerInput ?? 1024 + } + + public var ropeLocalBaseFreq: Float { + _ropeLocalBaseFreq ?? 10000.0 + } + + public var ropeTraditional: Bool { + _ropeTraditional ?? false + } + + public var ropeTheta: Float { + _ropeTheta ?? 1000000.0 + } + + public var queryPreAttnScalar: Float { + _queryPreAttnScalar ?? 0.0625 + } + + public var slidingWindow: Int { + _slidingWindow ?? 1024 + } + + public var mmTokensPerImage: Int { + _mmTokensPerImage ?? 256 + } + + public var slidingWindowPattern: Int { + _slidingWindowPattern ?? 5 + } + + public var finalLogitSoftcapping: Float { + _finalLogitSoftcapping ?? 30.0 + } + + public var queryRescaleScalar: Float { + _queryRescaleScalar ?? 1.0 + } + + public var numKvSharedLayers: Int { + _numKvSharedLayers ?? 0 + } + + public var maxPositionEmbeddings: Int { + _maxPositionEmbeddings ?? 32768 + } + + public var attnLogitSoftcapping: Float { + _attnLogitSoftcapping ?? 
0.0 + } + + enum CodingKeys: String, CodingKey { case modelType = "model_type" case hiddenSize = "hidden_size" case numHiddenLayers = "num_hidden_layers" case intermediateSize = "intermediate_size" - case numAttentionHeads = "num_attention_heads" - case headDim = "head_dim" - case rmsNormEps = "rms_norm_eps" - case vocabSize = "vocab_size" - case vocabSizePerLayerInput = "vocab_size_per_layer_input" - case numKeyValueHeads = "num_key_value_heads" - case laurelRank = "laurel_rank" - case fracSharedLayers = "frac_shared_layers" - case altupActiveIdx = "altup_active_idx" - case padTokenId = "pad_token_id" - case altupNumInputs = "altup_num_inputs" + case _numAttentionHeads = "num_attention_heads" + case _headDim = "head_dim" + case _rmsNormEps = "rms_norm_eps" + case _vocabSize = "vocab_size" + case _vocabSizePerLayerInput = "vocab_size_per_layer_input" + case _numKeyValueHeads = "num_key_value_heads" + case _laurelRank = "laurel_rank" + case _fracSharedLayers = "frac_shared_layers" + case _altupActiveIdx = "altup_active_idx" + case _padTokenId = "pad_token_id" + case _altupNumInputs = "altup_num_inputs" case altupCoefClip = "altup_coef_clip" - case altupCorrectScale = "altup_correct_scale" - case hiddenSizePerLayerInput = "hidden_size_per_layer_input" - case ropeLocalBaseFreq = "rope_local_base_freq" - case ropeTraditional = "rope_traditional" - case ropeTheta = "rope_theta" - case queryPreAttnScalar = "query_pre_attn_scalar" - case slidingWindow = "sliding_window" + case _altupCorrectScale = "altup_correct_scale" + case _hiddenSizePerLayerInput = "hidden_size_per_layer_input" + case _ropeLocalBaseFreq = "rope_local_base_freq" + case _ropeTraditional = "rope_traditional" + case _ropeTheta = "rope_theta" + case _queryPreAttnScalar = "query_pre_attn_scalar" + case _slidingWindow = "sliding_window" case ropeScaling = "rope_scaling" - case mmTokensPerImage = "mm_tokens_per_image" - case slidingWindowPattern = "sliding_window_pattern" + case _mmTokensPerImage = "mm_tokens_per_image" + case _slidingWindowPattern = "sliding_window_pattern" case activationSparsityPattern = "activation_sparsity_pattern" - case finalLogitSoftcapping = "final_logit_softcapping" - case queryRescaleScalar = "query_rescale_scalar" - case numKvSharedLayers = "num_kv_shared_layers" - case maxPositionEmbeddings = "max_position_embeddings" - case attnLogitSoftcapping = "attn_logit_softcapping" + case _finalLogitSoftcapping = "final_logit_softcapping" + case _queryRescaleScalar = "query_rescale_scalar" + case _numKvSharedLayers = "num_kv_shared_layers" + case _maxPositionEmbeddings = "max_position_embeddings" + case _attnLogitSoftcapping = "attn_logit_softcapping" case layerTypes = "layer_types" } } public struct ModelConfig: Codable, Sendable { + // Required configs (no defaults in Python) public let textConfig: TextConfig public let visionConfig: VisionConfig public let audioConfig: AudioConfig public let modelType: String - public let vocabSize: Int - public let ignoreIndex: Int - public let imageTokenIndex: Int - public let audioTokenId: Int - public let imageTokenId: Int - public let hiddenSize: Int - public let padTokenId: Int - public let visionSoftTokensPerImage: Int - public let audioSoftTokensPerImage: Int + + // Fields with default values (can be overridden from JSON) + private let _vocabSize: Int? + private let _ignoreIndex: Int? + private let _imageTokenIndex: Int? + private let _audioTokenId: Int? + private let _imageTokenId: Int? + private let _hiddenSize: Int? + private let _padTokenId: Int? 
+ private let _visionSoftTokensPerImage: Int? + private let _audioSoftTokensPerImage: Int? + + // Optional field public let eosTokenId: [Int]? + + // Computed properties with defaults + public var vocabSize: Int { + _vocabSize ?? 257152 + } + + public var ignoreIndex: Int { + _ignoreIndex ?? -100 + } + + public var imageTokenIndex: Int { + _imageTokenIndex ?? 262145 + } + + public var audioTokenId: Int { + _audioTokenId ?? 262273 + } + + public var imageTokenId: Int { + _imageTokenId ?? 262145 + } + + public var hiddenSize: Int { + _hiddenSize ?? 2048 + } + + public var padTokenId: Int { + _padTokenId ?? 0 + } + + public var visionSoftTokensPerImage: Int { + _visionSoftTokensPerImage ?? 256 + } + + public var audioSoftTokensPerImage: Int { + _audioSoftTokensPerImage ?? 188 + } + + // Custom initializer to allow manual construction + public init( + textConfig: TextConfig, + visionConfig: VisionConfig, + audioConfig: AudioConfig, + modelType: String, + vocabSize: Int? = nil, + ignoreIndex: Int? = nil, + imageTokenIndex: Int? = nil, + audioTokenId: Int? = nil, + imageTokenId: Int? = nil, + hiddenSize: Int? = nil, + padTokenId: Int? = nil, + visionSoftTokensPerImage: Int? = nil, + audioSoftTokensPerImage: Int? = nil, + eosTokenId: [Int]? = nil + ) { + self.textConfig = textConfig + self.visionConfig = visionConfig + self.audioConfig = audioConfig + self.modelType = modelType + self._vocabSize = vocabSize + self._ignoreIndex = ignoreIndex + self._imageTokenIndex = imageTokenIndex + self._audioTokenId = audioTokenId + self._imageTokenId = imageTokenId + self._hiddenSize = hiddenSize + self._padTokenId = padTokenId + self._visionSoftTokensPerImage = visionSoftTokensPerImage + self._audioSoftTokensPerImage = audioSoftTokensPerImage + self.eosTokenId = eosTokenId + } enum CodingKeys: String, CodingKey { case textConfig = "text_config" case visionConfig = "vision_config" case audioConfig = "audio_config" case modelType = "model_type" - case vocabSize = "vocab_size" - case ignoreIndex = "ignore_index" - case imageTokenIndex = "image_token_index" - case audioTokenId = "audio_token_id" - case imageTokenId = "image_token_id" - case hiddenSize = "hidden_size" - case padTokenId = "pad_token_id" - case visionSoftTokensPerImage = "vision_soft_tokens_per_image" - case audioSoftTokensPerImage = "audio_soft_tokens_per_image" + case _vocabSize = "vocab_size" + case _ignoreIndex = "ignore_index" + case _imageTokenIndex = "image_token_index" + case _audioTokenId = "audio_token_id" + case _imageTokenId = "image_token_id" + case _hiddenSize = "hidden_size" + case _padTokenId = "pad_token_id" + case _visionSoftTokensPerImage = "vision_soft_tokens_per_image" + case _audioSoftTokensPerImage = "audio_soft_tokens_per_image" case eosTokenId = "eos_token_id" } } @@ -445,7 +551,7 @@ private class Gemma3nAttention: Module { @ModuleInfo var vNorm: Gemma3nRMSNorm init(config: TextConfig, layerIdx: Int) { - self.isSliding = config.layerTypes[layerIdx] == "sliding_attention" + self.isSliding = (config.layerTypes ?? Array(repeating: "global_attention", count: config.numHiddenLayers))[layerIdx] == "sliding_attention" self.attnLogitSoftcapping = config.attnLogitSoftcapping let dim = config.hiddenSize @@ -760,7 +866,7 @@ private class Gemma3nDecoderLayer: Module { self.hiddenSizePerLayerInput = config.hiddenSizePerLayerInput self._selfAttn.wrappedValue = Gemma3nAttention(config: config, layerIdx: layerIdx) - self.isSliding = config.layerTypes[layerIdx] == "sliding_attention" + self.isSliding = (config.layerTypes ?? 
Array(repeating: "global_attention", count: config.numHiddenLayers))[layerIdx] == "sliding_attention" self._mlp.wrappedValue = MLP(config: config, layerIdx: layerIdx) self._inputLayernorm.wrappedValue = Gemma3nRMSNorm( @@ -1072,7 +1178,7 @@ private class Gemma3Model: Module { for (i, (layer, c)) in zip(layers[.. Date: Fri, 27 Jun 2025 12:40:55 +0200 Subject: [PATCH 04/19] Fixing sanitization --- Libraries/MLXVLM/Models/Gemma3n.swift | 476 +++++++++++++++++--------- 1 file changed, 322 insertions(+), 154 deletions(-) diff --git a/Libraries/MLXVLM/Models/Gemma3n.swift b/Libraries/MLXVLM/Models/Gemma3n.swift index 5b11240c..cc1f0dd5 100644 --- a/Libraries/MLXVLM/Models/Gemma3n.swift +++ b/Libraries/MLXVLM/Models/Gemma3n.swift @@ -136,110 +136,108 @@ public struct TextConfig: Codable, Sendable { private let _numKvSharedLayers: Int? private let _maxPositionEmbeddings: Int? private let _attnLogitSoftcapping: Float? - + // Computed properties with defaults public var numAttentionHeads: Int { _numAttentionHeads ?? 2 } - + public var headDim: Int { _headDim ?? 256 } - + public var rmsNormEps: Float { _rmsNormEps ?? 1.0e-6 } - + public var vocabSize: Int { _vocabSize ?? 262400 } - + public var vocabSizePerLayerInput: Int { _vocabSizePerLayerInput ?? 262144 } - + public var numKeyValueHeads: Int { _numKeyValueHeads ?? 4 } - + public var laurelRank: Int { _laurelRank ?? 64 } - + public var fracSharedLayers: Float { _fracSharedLayers ?? 0.5 } - + public var altupActiveIdx: Int { _altupActiveIdx ?? 0 } - + public var padTokenId: Int { _padTokenId ?? 0 } - + public var altupNumInputs: Int { _altupNumInputs ?? 4 } - + public var altupCorrectScale: Bool { _altupCorrectScale ?? true } - + public var hiddenSizePerLayerInput: Int { _hiddenSizePerLayerInput ?? 1024 } - + public var ropeLocalBaseFreq: Float { _ropeLocalBaseFreq ?? 10000.0 } - + public var ropeTraditional: Bool { _ropeTraditional ?? false } - + public var ropeTheta: Float { _ropeTheta ?? 1000000.0 } - + public var queryPreAttnScalar: Float { _queryPreAttnScalar ?? 0.0625 } - + public var slidingWindow: Int { _slidingWindow ?? 1024 } - + public var mmTokensPerImage: Int { _mmTokensPerImage ?? 256 } - + public var slidingWindowPattern: Int { _slidingWindowPattern ?? 5 } - + public var finalLogitSoftcapping: Float { _finalLogitSoftcapping ?? 30.0 } - + public var queryRescaleScalar: Float { _queryRescaleScalar ?? 1.0 } - + public var numKvSharedLayers: Int { _numKvSharedLayers ?? 0 } - + public var maxPositionEmbeddings: Int { _maxPositionEmbeddings ?? 32768 } - + public var attnLogitSoftcapping: Float { _attnLogitSoftcapping ?? 0.0 } - - enum CodingKeys: String, CodingKey { case modelType = "model_type" case hiddenSize = "hidden_size" @@ -283,7 +281,7 @@ public struct ModelConfig: Codable, Sendable { public let visionConfig: VisionConfig public let audioConfig: AudioConfig public let modelType: String - + // Fields with default values (can be overridden from JSON) private let _vocabSize: Int? private let _ignoreIndex: Int? @@ -294,47 +292,47 @@ public struct ModelConfig: Codable, Sendable { private let _padTokenId: Int? private let _visionSoftTokensPerImage: Int? private let _audioSoftTokensPerImage: Int? - + // Optional field public let eosTokenId: [Int]? - + // Computed properties with defaults public var vocabSize: Int { _vocabSize ?? 257152 } - + public var ignoreIndex: Int { _ignoreIndex ?? -100 } - + public var imageTokenIndex: Int { _imageTokenIndex ?? 262145 } - + public var audioTokenId: Int { _audioTokenId ?? 
262273 } - + public var imageTokenId: Int { _imageTokenId ?? 262145 } - + public var hiddenSize: Int { _hiddenSize ?? 2048 } - + public var padTokenId: Int { _padTokenId ?? 0 } - + public var visionSoftTokensPerImage: Int { _visionSoftTokensPerImage ?? 256 } - + public var audioSoftTokensPerImage: Int { _audioSoftTokensPerImage ?? 188 } - + // Custom initializer to allow manual construction public init( textConfig: TextConfig, @@ -388,32 +386,45 @@ public struct ModelConfig: Codable, Sendable { // MARK: - Language Model Components -private class Gemma3nRMSNorm: Module, UnaryLayer { +// Base protocol for RMSNorm variants +private protocol Gemma3nRMSNormProtocol: UnaryLayer { + func callAsFunction(_ x: MLXArray) -> MLXArray +} + +// RMSNorm with scale parameter +private class Gemma3nRMSNormWithScale: Module, Gemma3nRMSNormProtocol { let eps: Float let scaleShift: Float - let withScale: Bool - @ModuleInfo var weight: MLXArray? + @ModuleInfo var weight: MLXArray - init(dim: Int, eps: Float = 1e-6, scaleShift: Float = 0.0, withScale: Bool = true) { + init(dim: Int, eps: Float = 1e-6, scaleShift: Float = 0.0) { self.eps = eps self.scaleShift = scaleShift - self.withScale = withScale - - if withScale { - self._weight.wrappedValue = MLXArray.ones([dim]) - } else { - self._weight.wrappedValue = nil - } + self._weight.wrappedValue = MLXArray.ones([dim]) super.init() } func callAsFunction(_ x: MLXArray) -> MLXArray { let output = norm(x.asType(.float32)) + return (output * (weight + scaleShift)).asType(x.dtype) + } - if withScale, let weight = weight { - return (output * (weight + scaleShift)).asType(x.dtype) - } + private func norm(_ x: MLXArray) -> MLXArray { + return x * rsqrt(x.square().mean(axis: -1, keepDims: true) + eps) + } +} + +// RMSNorm without scale parameter (no weight to load from checkpoint) +private class Gemma3nRMSNormNoScale: Module, Gemma3nRMSNormProtocol { + let eps: Float + + init(dim: Int, eps: Float = 1e-6) { + self.eps = eps + super.init() + } + func callAsFunction(_ x: MLXArray) -> MLXArray { + let output = norm(x.asType(.float32)) return output.asType(x.dtype) } @@ -422,19 +433,32 @@ private class Gemma3nRMSNorm: Module, UnaryLayer { } } +// Factory function to create the appropriate RMSNorm variant +private func createGemma3nRMSNorm( + dim: Int, + eps: Float = 1e-6, + scaleShift: Float = 0.0, + withScale: Bool = true +) -> any Gemma3nRMSNormProtocol { + if withScale { + return Gemma3nRMSNormWithScale(dim: dim, eps: eps, scaleShift: scaleShift) + } else { + return Gemma3nRMSNormNoScale(dim: dim, eps: eps) + } +} + private class Gemma3nLaurelBlock: Module { @ModuleInfo var linearLeft: Linear @ModuleInfo var linearRight: Linear - @ModuleInfo var postLaurelNorm: Gemma3nRMSNorm + @ModuleInfo var postLaurelNorm: Gemma3nRMSNormWithScale init(config: TextConfig) { self._linearLeft.wrappedValue = Linear(config.hiddenSize, config.laurelRank, bias: false) self._linearRight.wrappedValue = Linear(config.laurelRank, config.hiddenSize, bias: false) - self._postLaurelNorm.wrappedValue = Gemma3nRMSNorm( + self._postLaurelNorm.wrappedValue = Gemma3nRMSNormWithScale( dim: config.hiddenSize, eps: config.rmsNormEps, - scaleShift: 0.0, - withScale: true + scaleShift: 0.0 ) super.init() } @@ -471,8 +495,8 @@ private class Gemma3nRotaryEmbedding: Module { let originalMaxSeqLen: Int let config: TextConfig let attentionScaling: Float - @ModuleInfo var invFreq: MLXArray - @ModuleInfo var originalInvFreq: MLXArray + let invFreq: MLXArray + let originalInvFreq: MLXArray init(config: TextConfig) { if 
let ropeScaling = config.ropeScaling { @@ -492,8 +516,8 @@ private class Gemma3nRotaryEmbedding: Module { self.attentionScaling = 1.0 let (invFreq, _) = Self.computeDefaultRopeParameters(config: config) - self._invFreq.wrappedValue = MLXArray(invFreq).asType(.float32) - self._originalInvFreq.wrappedValue = MLXArray(invFreq).asType(.float32) + self.invFreq = MLXArray(invFreq).asType(.float32) + self.originalInvFreq = MLXArray(invFreq).asType(.float32) super.init() } @@ -546,12 +570,15 @@ private class Gemma3nAttention: Module { @ModuleInfo var kProj: Linear @ModuleInfo var vProj: Linear @ModuleInfo var oProj: Linear - @ModuleInfo var qNorm: Gemma3nRMSNorm - @ModuleInfo var kNorm: Gemma3nRMSNorm - @ModuleInfo var vNorm: Gemma3nRMSNorm + @ModuleInfo var qNorm: Gemma3nRMSNormWithScale + @ModuleInfo var kNorm: Gemma3nRMSNormWithScale + @ModuleInfo var vNorm: Gemma3nRMSNormNoScale init(config: TextConfig, layerIdx: Int) { - self.isSliding = (config.layerTypes ?? Array(repeating: "global_attention", count: config.numHiddenLayers))[layerIdx] == "sliding_attention" + self.isSliding = + (config.layerTypes + ?? Array(repeating: "global_attention", count: config.numHiddenLayers))[layerIdx] + == "sliding_attention" self.attnLogitSoftcapping = config.attnLogitSoftcapping let dim = config.hiddenSize @@ -567,12 +594,13 @@ private class Gemma3nAttention: Module { self._vProj.wrappedValue = Linear(dim, numKVHeads * headDim, bias: false) self._oProj.wrappedValue = Linear(numHeads * headDim, dim, bias: false) - self._qNorm.wrappedValue = Gemma3nRMSNorm(dim: config.headDim, eps: config.rmsNormEps) - self._kNorm.wrappedValue = Gemma3nRMSNorm(dim: config.headDim, eps: config.rmsNormEps) - self._vNorm.wrappedValue = Gemma3nRMSNorm( + self._qNorm.wrappedValue = Gemma3nRMSNormWithScale( + dim: config.headDim, eps: config.rmsNormEps) + self._kNorm.wrappedValue = Gemma3nRMSNormWithScale( + dim: config.headDim, eps: config.rmsNormEps) + self._vNorm.wrappedValue = Gemma3nRMSNormNoScale( dim: config.headDim, - eps: config.rmsNormEps, - withScale: false + eps: config.rmsNormEps ) let firstKvSharedLayerIdx = config.numHiddenLayers - config.numKvSharedLayers @@ -721,7 +749,7 @@ private class Gemma3nAltUp: Module { @ModuleInfo var correctionCoefs: Linear @ModuleInfo var predictionCoefs: Linear @ModuleInfo var modalityRouter: Linear - @ModuleInfo var routerNorm: Gemma3nRMSNorm + @ModuleInfo var routerNorm: Gemma3nRMSNormWithScale @ModuleInfo var routerInputScale: MLXArray let config: TextConfig @@ -745,11 +773,10 @@ private class Gemma3nAltUp: Module { config.altupNumInputs, bias: false ) - self._routerNorm.wrappedValue = Gemma3nRMSNorm( + self._routerNorm.wrappedValue = Gemma3nRMSNormWithScale( dim: config.hiddenSize, eps: config.rmsNormEps, - scaleShift: 0.0, - withScale: true + scaleShift: 0.0 ) self._routerInputScale.wrappedValue = MLXArray(pow(Float(config.hiddenSize), -1.0)) @@ -758,7 +785,7 @@ private class Gemma3nAltUp: Module { func computeRouterModalities(_ x: MLXArray) -> MLXArray { let routerInputs = - routerNorm(x) * routerInputScale.asType(routerNorm.weight?.dtype ?? 
x.dtype) + routerNorm(x) * routerInputScale.asType(routerNorm.weight.dtype) let routed = modalityRouter(routerInputs).asType(.float32) return tanh(routed) } @@ -848,15 +875,15 @@ private class Gemma3nDecoderLayer: Module { @ModuleInfo var selfAttn: Gemma3nAttention @ModuleInfo var mlp: MLP - @ModuleInfo var inputLayernorm: Gemma3nRMSNorm - @ModuleInfo var postAttentionLayernorm: Gemma3nRMSNorm - @ModuleInfo var preFeedforwardLayernorm: Gemma3nRMSNorm - @ModuleInfo var postFeedforwardLayernorm: Gemma3nRMSNorm + @ModuleInfo var inputLayernorm: Gemma3nRMSNormWithScale + @ModuleInfo var postAttentionLayernorm: Gemma3nRMSNormWithScale + @ModuleInfo var preFeedforwardLayernorm: Gemma3nRMSNormWithScale + @ModuleInfo var postFeedforwardLayernorm: Gemma3nRMSNormWithScale @ModuleInfo var altup: Gemma3nAltUp @ModuleInfo var laurel: Gemma3nLaurelBlock @ModuleInfo var perLayerInputGate: Linear @ModuleInfo var perLayerProjection: Linear - @ModuleInfo var postPerLayerInputNorm: Gemma3nRMSNorm + @ModuleInfo var postPerLayerInputNorm: Gemma3nRMSNormWithScale init(config: TextConfig, layerIdx: Int) { self.config = config @@ -866,33 +893,32 @@ private class Gemma3nDecoderLayer: Module { self.hiddenSizePerLayerInput = config.hiddenSizePerLayerInput self._selfAttn.wrappedValue = Gemma3nAttention(config: config, layerIdx: layerIdx) - self.isSliding = (config.layerTypes ?? Array(repeating: "global_attention", count: config.numHiddenLayers))[layerIdx] == "sliding_attention" + self.isSliding = + (config.layerTypes + ?? Array(repeating: "global_attention", count: config.numHiddenLayers))[layerIdx] + == "sliding_attention" self._mlp.wrappedValue = MLP(config: config, layerIdx: layerIdx) - self._inputLayernorm.wrappedValue = Gemma3nRMSNorm( + self._inputLayernorm.wrappedValue = Gemma3nRMSNormWithScale( dim: hiddenSize, eps: config.rmsNormEps, - scaleShift: 0.0, - withScale: true + scaleShift: 0.0 ) - self._postAttentionLayernorm.wrappedValue = Gemma3nRMSNorm( + self._postAttentionLayernorm.wrappedValue = Gemma3nRMSNormWithScale( dim: hiddenSize, eps: config.rmsNormEps, - scaleShift: 0.0, - withScale: true + scaleShift: 0.0 ) - self._preFeedforwardLayernorm.wrappedValue = Gemma3nRMSNorm( + self._preFeedforwardLayernorm.wrappedValue = Gemma3nRMSNormWithScale( dim: hiddenSize, eps: config.rmsNormEps, - scaleShift: 0.0, - withScale: true + scaleShift: 0.0 ) - self._postFeedforwardLayernorm.wrappedValue = Gemma3nRMSNorm( + self._postFeedforwardLayernorm.wrappedValue = Gemma3nRMSNormWithScale( dim: hiddenSize, eps: config.rmsNormEps, - scaleShift: 0.0, - withScale: true + scaleShift: 0.0 ) self._altup.wrappedValue = Gemma3nAltUp(config: config) @@ -908,11 +934,10 @@ private class Gemma3nDecoderLayer: Module { hiddenSize, bias: false ) - self._postPerLayerInputNorm.wrappedValue = Gemma3nRMSNorm( + self._postPerLayerInputNorm.wrappedValue = Gemma3nRMSNormWithScale( dim: hiddenSize, eps: config.rmsNormEps, - scaleShift: 0.0, - withScale: true + scaleShift: 0.0 ) super.init() @@ -1037,10 +1062,10 @@ private class Gemma3Model: Module { @ModuleInfo var layers: [Gemma3nDecoderLayer] @ModuleInfo var embedTokensPerLayer: Gemma3nTextScaledWordEmbedding @ModuleInfo var perLayerModelProjection: Linear - @ModuleInfo var perLayerProjectionNorm: Gemma3nRMSNorm + @ModuleInfo var perLayerProjectionNorm: Gemma3nRMSNormWithScale @ModuleInfo var altupProjections: [Linear] @ModuleInfo var altupUnembedProjections: [Linear] - @ModuleInfo var norm: Gemma3nRMSNorm + @ModuleInfo var norm: Gemma3nRMSNormWithScale @ModuleInfo var ropeEmbedding: 
Gemma3nRotaryEmbedding @ModuleInfo var ropeEmbeddingLocal: Gemma3nRotaryEmbedding @@ -1075,11 +1100,10 @@ private class Gemma3Model: Module { bias: false ) - self._perLayerProjectionNorm.wrappedValue = Gemma3nRMSNorm( + self._perLayerProjectionNorm.wrappedValue = Gemma3nRMSNormWithScale( dim: config.hiddenSizePerLayerInput, eps: config.rmsNormEps, - scaleShift: 0.0, - withScale: true + scaleShift: 0.0 ) self._altupProjections.wrappedValue = (1 ..< config.altupNumInputs).map { _ in @@ -1090,11 +1114,10 @@ private class Gemma3Model: Module { Linear(config.hiddenSize, config.hiddenSize, bias: false) } - self._norm.wrappedValue = Gemma3nRMSNorm( + self._norm.wrappedValue = Gemma3nRMSNormWithScale( dim: config.hiddenSize, eps: config.rmsNormEps, - scaleShift: 0.0, - withScale: true + scaleShift: 0.0 ) self.perLayerProjectionScale = MLXArray(pow(Float(hiddenSize), -0.5)) @@ -1178,7 +1201,10 @@ private class Gemma3Model: Module { for (i, (layer, c)) in zip(layers[.. Bool { let shape = arr.shape - guard shape.count == 4 else { return false } + guard shape.count == 4 else { + print("🔍 checkArrayShape: Array has \(shape.count) dimensions, not 4") + return false + } let (outChannels, kH, kW, _) = (shape[0], shape[1], shape[2], shape[3]) - return (outChannels >= kH) && (outChannels >= kW) && (kH == kW) + let result = (outChannels >= kH) && (outChannels >= kW) && (kH == kW) + print( + "🔍 checkArrayShape: shape=\(shape), outChannels=\(outChannels), kH=\(kH), kW=\(kW), result=\(result)" + ) + return result } // MARK: - Main Model @@ -1707,7 +1745,7 @@ public class Gemma3n: Module, VLMModel, KVCacheDimensionProvider { if visionMask.any().item() { let visionTokens = MLX.where(visionMask, inputIds, MLXArray.zeros(like: inputIds)) - let visionEmbedsFlat = embedVision(visionTokens) + let visionEmbedsFlat = embedVision.callAsFunction(visionTokens, inputsEmbeds: nil) inputsEmbeds = MLX.where( expandedDimensions(visionMask, axis: -1), visionEmbedsFlat, @@ -1722,7 +1760,7 @@ public class Gemma3n: Module, VLMModel, KVCacheDimensionProvider { if audioMask.any().item() { let audioTokens = MLX.where(audioMask, inputIds, MLXArray.zeros(like: inputIds)) - let audioEmbedsFlat = embedAudio(audioTokens) + let audioEmbedsFlat = embedAudio.callAsFunction(audioTokens, inputsEmbeds: nil) inputsEmbeds = MLX.where( expandedDimensions(audioMask, axis: -1), audioEmbedsFlat, @@ -1749,7 +1787,7 @@ public class Gemma3n: Module, VLMModel, KVCacheDimensionProvider { if let inputFeatures = inputFeatures, let inputFeaturesMask = inputFeaturesMask { let (audioFeatures, audioMask) = getAudioFeatures(inputFeatures, .!inputFeaturesMask) let audioPaddingIds = MLXArray([config.vocabSize - 1]).expandedDimensions(axis: 0) - let audioPaddingEmbs = embedAudio(audioPaddingIds) + let audioPaddingEmbs = embedAudio.callAsFunction(audioPaddingIds, inputsEmbeds: nil) let maskedAudioFeatures = MLX.where( expandedDimensions(audioMask, axis: -1), @@ -1786,7 +1824,7 @@ public class Gemma3n: Module, VLMModel, KVCacheDimensionProvider { MLXArray, MLXArray ) { let (audioOutputs, audioMask) = audioTower(inputFeatures, inputFeaturesMask) - return (embedAudio(nil, inputsEmbeds: audioOutputs), audioMask) + return (embedAudio.callAsFunction(nil, inputsEmbeds: audioOutputs), audioMask) } func getImageFeatures(_ pixelValues: MLXArray) -> MLXArray { @@ -1804,7 +1842,7 @@ public class Gemma3n: Module, VLMModel, KVCacheDimensionProvider { // Normalize and embed the soft tokens into language model space let scaledOutputs = reshaped * 
pow(Float(config.visionConfig.hiddenSize), 0.5) - return embedVision(nil, inputsEmbeds: scaledOutputs) + return embedVision.callAsFunction(nil, inputsEmbeds: scaledOutputs) } func mergeMultimodalAndText( @@ -1822,7 +1860,7 @@ public class Gemma3n: Module, VLMModel, KVCacheDimensionProvider { // When inputIds is nil, create mask by comparing embeddings let embedFn: (MLXArray) -> MLXArray = modality == "audio" - ? { self.embedAudio($0, inputsEmbeds: nil) } + ? { self.embedAudio.callAsFunction($0, inputsEmbeds: nil) } : { self.languageModel.model.embedTokens($0) } let tokenEmbedding = embedFn(MLXArray([tokenId])) specialModalityMask = inputsEmbeds .== tokenEmbedding @@ -1877,25 +1915,63 @@ public class Gemma3n: Module, VLMModel, KVCacheDimensionProvider { } public func sanitize(weights: [String: MLXArray]) -> [String: MLXArray] { - var processedWeights = languageModel.sanitize(weights: weights) - processedWeights = visionTower.sanitize(weights: processedWeights) - processedWeights = audioTower.sanitize(weights: processedWeights) + print("🔍 Gemma3n.sanitize: Starting with \(weights.count) weights") var sanitizedWeights = [String: MLXArray]() - for (k, v) in processedWeights { + + // Main model sanitization - remove "model." prefix + for (k, v) in weights { if k.hasPrefix("model.") { - sanitizedWeights[String(k.dropFirst(6))] = v + // Python: ".".join(k.split(".")[1:]) -> remove first component, join rest + let components = k.split(separator: ".") + if components.count > 1 { + let newKey = components.dropFirst().joined(separator: ".") + sanitizedWeights[newKey] = v + } else { + sanitizedWeights[k] = v + } } else { sanitizedWeights[k] = v } } + print("🔍 Gemma3n.sanitize: After main sanitization, have \(sanitizedWeights.count) weights") + + // Apply vision model sanitization using static method (matches Python from_pretrained exactly) + sanitizedWeights = Gemma3nVisionModel.sanitizeWeights(sanitizedWeights) + + print( + "🔍 Gemma3n.sanitize: After vision sanitization, have \(sanitizedWeights.count) weights") + + // Apply audio model sanitization for Conv2d and Conv1d layers + sanitizedWeights = Gemma3nAudioModel.sanitizeWeights(sanitizedWeights) + print( + "🔍 Gemma3n.sanitize: After audio sanitization, have \(sanitizedWeights.count) weights") + + // Debug: Print embedding-related weight keys + let embeddingKeys = sanitizedWeights.keys.filter { $0.contains("embed") } + print("🔍 Gemma3n.sanitize: Found \(embeddingKeys.count) embedding-related keys:") + for key in embeddingKeys.sorted() { + print("🔍 - \(key)") + } + + // Debug: Print RMSNorm-related weight keys + let rmsnormKeys = sanitizedWeights.keys.filter { + $0.contains("norm") && $0.contains("weight") + } + print("🔍 Gemma3n.sanitize: Found \(rmsnormKeys.count) RMSNorm-related keys:") + for key in rmsnormKeys.sorted() { + print("🔍 - \(key)") + } + return sanitizedWeights } public static func fromPretrained(pathOrHfRepo: String) throws -> Gemma3n { let path = URL(fileURLWithPath: pathOrHfRepo) + print("🔍 Gemma3n.fromPretrained: Loading from \(pathOrHfRepo)") + // Load config let configPath = path.appendingPathComponent("config.json") let configData = try Data(contentsOf: configPath) @@ -1926,6 +2002,8 @@ public class Gemma3n: Module, VLMModel, KVCacheDimensionProvider { let weightFiles = try FileManager.default.contentsOfDirectory(atPath: path.path) .filter { $0.hasSuffix(".safetensors") } + print("🔍 Gemma3n.fromPretrained: Found \(weightFiles.count) weight files: \(weightFiles)") + guard !weightFiles.isEmpty else { throw NSError( domain: 
"ModelLoading", code: 1, @@ -1935,13 +2013,24 @@ public class Gemma3n: Module, VLMModel, KVCacheDimensionProvider { var weights = [String: MLXArray]() for weightFile in weightFiles { let weightPath = path.appendingPathComponent(weightFile) + print("🔍 Gemma3n.fromPretrained: Loading weights from \(weightFile)") let fileWeights = try loadArrays(url: weightPath) + print( + "🔍 Gemma3n.fromPretrained: Loaded \(fileWeights.count) weights from \(weightFile)") weights.merge(fileWeights) { _, new in new } } - var sanitizedWeights = model.sanitize(weights: weights) - sanitizedWeights = model.visionTower.sanitize(weights: sanitizedWeights) + print("🔍 Gemma3n.fromPretrained: Total weights loaded: \(weights.count)") + + // Print some sample weight keys to understand the structure + let sampleKeys = Array(weights.keys.prefix(10)) + print("🔍 Gemma3n.fromPretrained: Sample weight keys: \(sampleKeys)") + + let sanitizedWeights = model.sanitize(weights: weights) + + print("🔍 Gemma3n.fromPretrained: Attempting to update model parameters...") try model.update(parameters: ModuleParameters.unflattened(sanitizedWeights), verify: [.all]) + print("🔍 Gemma3n.fromPretrained: Model parameters updated successfully!") return model } @@ -2578,10 +2667,10 @@ private class Gemma3nAudioConformerAttention: Module { let postInFeatures: Int let gradientClipping: MLXArray - @ModuleInfo var preAttnNorm: Gemma3nRMSNorm + @ModuleInfo var preAttnNorm: Gemma3nRMSNormWithScale @ModuleInfo var attn: Gemma3nAudioAttention @ModuleInfo var post: Linear - @ModuleInfo var postNorm: Gemma3nRMSNorm + @ModuleInfo var postNorm: Gemma3nRMSNormWithScale init(config: AudioConfig) { self.config = config @@ -2589,10 +2678,10 @@ private class Gemma3nAudioConformerAttention: Module { self.postInFeatures = config.hiddenSize self.gradientClipping = MLXArray(config.gradientClipping) - self._preAttnNorm.wrappedValue = Gemma3nRMSNorm(dim: config.hiddenSize) + self._preAttnNorm.wrappedValue = Gemma3nRMSNormWithScale(dim: config.hiddenSize) self._attn.wrappedValue = Gemma3nAudioAttention(config: config) self._post.wrappedValue = Linear(postInFeatures, config.hiddenSize, bias: false) - self._postNorm.wrappedValue = Gemma3nRMSNorm(dim: config.hiddenSize) + self._postNorm.wrappedValue = Gemma3nRMSNormWithScale(dim: config.hiddenSize) super.init() } @@ -2621,20 +2710,20 @@ private class Gemma3nAudioConformerFeedForward: Module { let gradientClipping: MLXArray let postLayerScale: MLXArray - @ModuleInfo var preLayerNorm: Gemma3nRMSNorm + @ModuleInfo var preLayerNorm: Gemma3nRMSNormWithScale @ModuleInfo var ffwLayer1: Linear @ModuleInfo var ffwLayer2: Linear - @ModuleInfo var postLayerNorm: Gemma3nRMSNorm + @ModuleInfo var postLayerNorm: Gemma3nRMSNormWithScale init(config: AudioConfig) { self.config = config self.gradientClipping = MLXArray(config.gradientClipping) self.postLayerScale = MLXArray(config.confResidualWeight) - self._preLayerNorm.wrappedValue = Gemma3nRMSNorm(dim: config.hiddenSize) + self._preLayerNorm.wrappedValue = Gemma3nRMSNormWithScale(dim: config.hiddenSize) self._ffwLayer1.wrappedValue = Linear(config.hiddenSize, config.hiddenSize * 4, bias: false) self._ffwLayer2.wrappedValue = Linear(config.hiddenSize * 4, config.hiddenSize, bias: false) - self._postLayerNorm.wrappedValue = Gemma3nRMSNorm(dim: config.hiddenSize) + self._postLayerNorm.wrappedValue = Gemma3nRMSNormWithScale(dim: config.hiddenSize) super.init() } @@ -2658,10 +2747,10 @@ private class Gemma3nAudioConformerLightConv1d: Module { let gradientClipping: MLXArray let causalPadding: 
Int - @ModuleInfo var preLayerNorm: Gemma3nRMSNorm + @ModuleInfo var preLayerNorm: Gemma3nRMSNormWithScale @ModuleInfo var linearStart: Linear @ModuleInfo var depthwiseConv1d: Conv1d - @ModuleInfo var convNorm: Gemma3nRMSNorm + @ModuleInfo var convNorm: Gemma3nRMSNormWithScale @ModuleInfo var linearEnd: Linear init(config: AudioConfig) { @@ -2669,7 +2758,7 @@ private class Gemma3nAudioConformerLightConv1d: Module { self.gradientClipping = MLXArray(config.gradientClipping) self.causalPadding = config.confConvKernelSize - 1 - self._preLayerNorm.wrappedValue = Gemma3nRMSNorm( + self._preLayerNorm.wrappedValue = Gemma3nRMSNormWithScale( dim: config.hiddenSize, eps: config.rmsNormEps ) @@ -2687,7 +2776,7 @@ private class Gemma3nAudioConformerLightConv1d: Module { groups: config.hiddenSize, bias: false ) - self._convNorm.wrappedValue = Gemma3nRMSNorm( + self._convNorm.wrappedValue = Gemma3nRMSNormWithScale( dim: config.hiddenSize, eps: config.rmsNormEps ) @@ -2730,7 +2819,7 @@ private class Gemma3nAudioConformerBlock: Module { @ModuleInfo var attention: Gemma3nAudioConformerAttention @ModuleInfo var lconv1d: Gemma3nAudioConformerLightConv1d @ModuleInfo var ffwLayerEnd: Gemma3nAudioConformerFeedForward - @ModuleInfo var norm: Gemma3nRMSNorm + @ModuleInfo var norm: Gemma3nRMSNormWithScale init(config: AudioConfig) { self.config = config @@ -2740,7 +2829,7 @@ private class Gemma3nAudioConformerBlock: Module { self._attention.wrappedValue = Gemma3nAudioConformerAttention(config: config) self._lconv1d.wrappedValue = Gemma3nAudioConformerLightConv1d(config: config) self._ffwLayerEnd.wrappedValue = Gemma3nAudioConformerFeedForward(config: config) - self._norm.wrappedValue = Gemma3nRMSNorm(dim: config.hiddenSize) + self._norm.wrappedValue = Gemma3nRMSNormWithScale(dim: config.hiddenSize) super.init() } @@ -3775,19 +3864,37 @@ private class Gemma3nVisionModel: Module { } func sanitize(weights: [String: MLXArray]) -> [String: MLXArray] { + return Self.sanitizeWeights(weights) + } + + static func sanitizeWeights(_ weights: [String: MLXArray]) -> [String: MLXArray] { var sanitizedWeights = [String: MLXArray]() var skipTranspose = false - // Check if weights are already in MLX format - if let convWeight = weights["vision_tower.timm_model.blocks.0.0.conv_exp.weight"] { + print("🔍 VisionModel.sanitize: Starting with \(weights.count) weights") + + // Match Python exactly: use the specific key it expects + let testKey = "vision_tower.timm_model.blocks.0.0.conv_exp.weight" + if let convWeight = weights[testKey] { let (_, H, _, C) = ( convWeight.shape[0], convWeight.shape[1], convWeight.shape[2], convWeight.shape[3] ) + print( + "🔍 VisionModel.sanitize: Found test key '\(testKey)' with shape \(convWeight.shape), H=\(H), C=\(C)" + ) if C > H { skipTranspose = true + print( + "🔍 VisionModel.sanitize: Setting skipTranspose=true because C(\(C)) > H(\(H))") } + } else { + print( + "🔍 VisionModel.sanitize: WARNING - Expected test key '\(testKey)' not found in weights!" 
+ ) } + print("🔍 VisionModel.sanitize: skipTranspose=\(skipTranspose)") + for (k, v) in weights { // PyTorch conv2d weight: [out_channels, in_channels, kH, kW] // MLX conv2d weight: [out_channels, kH, KW, in_channels] @@ -3795,8 +3902,14 @@ private class Gemma3nVisionModel: Module { || (k.contains("attn") && k.contains("proj.weight")) { if v.shape.count == 4 && !skipTranspose { + print( + "🔍 VisionModel.sanitize: Transposing '\(k)' from \(v.shape) to \(v.transposed(0, 2, 3, 1).shape)" + ) sanitizedWeights[k] = v.transposed(0, 2, 3, 1) } else { + print( + "🔍 VisionModel.sanitize: Keeping '\(k)' as-is with shape \(v.shape) (skipTranspose=\(skipTranspose))" + ) sanitizedWeights[k] = v } } else { @@ -3804,6 +3917,7 @@ private class Gemma3nVisionModel: Module { } } + print("🔍 VisionModel.sanitize: Completed with \(sanitizedWeights.count) weights") return sanitizedWeights } } @@ -3906,17 +4020,70 @@ private class Gemma3nAudioModel: Module { func sanitize(weights: [String: MLXArray]) -> [String: MLXArray] { var sanitizedWeights = [String: MLXArray]() + print("🔍 AudioModel.sanitize: Starting with \(weights.count) weights") + + for (k, v) in weights { + if k.contains("conv.weight") { + if checkArrayShape(v) { + print( + "🔍 AudioModel.sanitize: Keeping conv weight '\(k)' as-is with shape \(v.shape)" + ) + sanitizedWeights[k] = v + } else { + print( + "🔍 AudioModel.sanitize: Transposing conv weight '\(k)' from \(v.shape) to \(v.transposed(0, 2, 3, 1).shape)" + ) + sanitizedWeights[k] = v.transposed(0, 2, 3, 1) + } + } else if k.contains("conv1d.weight") { + if checkArrayShape(v) { + print( + "🔍 AudioModel.sanitize: Keeping conv1d weight '\(k)' as-is with shape \(v.shape)" + ) + sanitizedWeights[k] = v + } else { + print( + "🔍 AudioModel.sanitize: Transposing conv1d weight '\(k)' from \(v.shape) to \(v.transposed(0, 2, 1).shape)" + ) + sanitizedWeights[k] = v.transposed(0, 2, 1) + } + } else { + sanitizedWeights[k] = v + } + } + + print("🔍 AudioModel.sanitize: Completed with \(sanitizedWeights.count) weights") + return sanitizedWeights + } + + static func sanitizeWeights(_ weights: [String: MLXArray]) -> [String: MLXArray] { + var sanitizedWeights = [String: MLXArray]() + + print("🔍 AudioModel.sanitizeWeights: Starting with \(weights.count) weights") + for (k, v) in weights { if k.contains("conv.weight") { if checkArrayShape(v) { + print( + "🔍 AudioModel.sanitizeWeights: Keeping conv weight '\(k)' as-is with shape \(v.shape)" + ) sanitizedWeights[k] = v } else { + print( + "🔍 AudioModel.sanitizeWeights: Transposing conv weight '\(k)' from \(v.shape) to \(v.transposed(0, 2, 3, 1).shape)" + ) sanitizedWeights[k] = v.transposed(0, 2, 3, 1) } } else if k.contains("conv1d.weight") { if checkArrayShape(v) { + print( + "🔍 AudioModel.sanitizeWeights: Keeping conv1d weight '\(k)' as-is with shape \(v.shape)" + ) sanitizedWeights[k] = v } else { + print( + "🔍 AudioModel.sanitizeWeights: Transposing conv1d weight '\(k)' from \(v.shape) to \(v.transposed(0, 2, 1).shape)" + ) sanitizedWeights[k] = v.transposed(0, 2, 1) } } else { @@ -3924,6 +4091,7 @@ private class Gemma3nAudioModel: Module { } } + print("🔍 AudioModel.sanitizeWeights: Completed with \(sanitizedWeights.count) weights") return sanitizedWeights } } @@ -3961,35 +4129,35 @@ public struct Gemma3nConfiguration: Codable, Sendable { public var vocabSize: Int { _vocabSize ?? 257152 } - + public var ignoreIndex: Int { _ignoreIndex ?? -100 } - + public var imageTokenIndex: Int { _imageTokenIndex ?? 262145 } - + public var audioTokenId: Int { _audioTokenId ?? 
262273 } - + public var imageTokenId: Int { _imageTokenId ?? 262145 } - + public var hiddenSize: Int { _hiddenSize ?? 2048 } - + public var padTokenId: Int { _padTokenId ?? 0 } - + public var visionSoftTokensPerImage: Int { _visionSoftTokensPerImage ?? 256 } - + public var audioSoftTokensPerImage: Int { _audioSoftTokensPerImage ?? 188 } From e417d7eab4d1be294e3b746358631fbcb9f9b02b Mon Sep 17 00:00:00 2001 From: Anthony DePasquale Date: Fri, 27 Jun 2025 18:23:23 +0200 Subject: [PATCH 05/19] Fix sanitization, still have problems with loading --- Libraries/MLXVLM/Models/Gemma3n.swift | 397 +++++++++----------------- 1 file changed, 139 insertions(+), 258 deletions(-) diff --git a/Libraries/MLXVLM/Models/Gemma3n.swift b/Libraries/MLXVLM/Models/Gemma3n.swift index cc1f0dd5..e8a600c0 100644 --- a/Libraries/MLXVLM/Models/Gemma3n.swift +++ b/Libraries/MLXVLM/Models/Gemma3n.swift @@ -448,9 +448,9 @@ private func createGemma3nRMSNorm( } private class Gemma3nLaurelBlock: Module { - @ModuleInfo var linearLeft: Linear - @ModuleInfo var linearRight: Linear - @ModuleInfo var postLaurelNorm: Gemma3nRMSNormWithScale + @ModuleInfo(key: "linear_left") var linearLeft: Linear + @ModuleInfo(key: "linear_right") var linearRight: Linear + @ModuleInfo(key: "post_laurel_norm") var postLaurelNorm: Gemma3nRMSNormWithScale init(config: TextConfig) { self._linearLeft.wrappedValue = Linear(config.hiddenSize, config.laurelRank, bias: false) @@ -566,13 +566,13 @@ private class Gemma3nAttention: Module { let isKvSharedLayer: Bool let kvSharedLayerIndex: Int? - @ModuleInfo var qProj: Linear - @ModuleInfo var kProj: Linear - @ModuleInfo var vProj: Linear - @ModuleInfo var oProj: Linear - @ModuleInfo var qNorm: Gemma3nRMSNormWithScale - @ModuleInfo var kNorm: Gemma3nRMSNormWithScale - @ModuleInfo var vNorm: Gemma3nRMSNormNoScale + @ModuleInfo(key: "q_proj") var qProj: Linear + @ModuleInfo(key: "k_proj") var kProj: Linear + @ModuleInfo(key: "v_proj") var vProj: Linear + @ModuleInfo(key: "o_proj") var oProj: Linear + @ModuleInfo(key: "q_norm") var qNorm: Gemma3nRMSNormWithScale + @ModuleInfo(key: "k_norm") var kNorm: Gemma3nRMSNormWithScale + @ModuleInfo(key: "v_norm") var vNorm: Gemma3nRMSNormNoScale init(config: TextConfig, layerIdx: Int) { self.isSliding = @@ -697,9 +697,9 @@ private class Gemma3nAttention: Module { } private class MLP: Module, UnaryLayer { - @ModuleInfo var gateProj: Linear - @ModuleInfo var upProj: Linear - @ModuleInfo var downProj: Linear + @ModuleInfo(key: "gate_proj") var gateProj: Linear + @ModuleInfo(key: "up_proj") var upProj: Linear + @ModuleInfo(key: "down_proj") var downProj: Linear let config: TextConfig let activationSparsity: Float @@ -745,12 +745,12 @@ private class MLP: Module, UnaryLayer { } private class Gemma3nAltUp: Module { - @ModuleInfo var correctOutputScale: MLXArray - @ModuleInfo var correctionCoefs: Linear - @ModuleInfo var predictionCoefs: Linear - @ModuleInfo var modalityRouter: Linear - @ModuleInfo var routerNorm: Gemma3nRMSNormWithScale - @ModuleInfo var routerInputScale: MLXArray + @ModuleInfo(key: "correct_output_scale") var correctOutputScale: MLXArray + @ModuleInfo(key: "correction_coefs") var correctionCoefs: Linear + @ModuleInfo(key: "prediction_coefs") var predictionCoefs: Linear + @ModuleInfo(key: "modality_router") var modalityRouter: Linear + @ModuleInfo(key: "router_norm") var routerNorm: Gemma3nRMSNormWithScale + @ModuleInfo(key: "router_input_scale") var routerInputScale: MLXArray let config: TextConfig @@ -873,17 +873,19 @@ private class Gemma3nDecoderLayer: 
Module { let slidingWindow: Int let hiddenSizePerLayerInput: Int - @ModuleInfo var selfAttn: Gemma3nAttention + @ModuleInfo(key: "self_attn") var selfAttn: Gemma3nAttention @ModuleInfo var mlp: MLP - @ModuleInfo var inputLayernorm: Gemma3nRMSNormWithScale - @ModuleInfo var postAttentionLayernorm: Gemma3nRMSNormWithScale - @ModuleInfo var preFeedforwardLayernorm: Gemma3nRMSNormWithScale - @ModuleInfo var postFeedforwardLayernorm: Gemma3nRMSNormWithScale + @ModuleInfo(key: "input_layernorm") var inputLayernorm: Gemma3nRMSNormWithScale + @ModuleInfo(key: "post_attention_layernorm") var postAttentionLayernorm: Gemma3nRMSNormWithScale + @ModuleInfo(key: "pre_feedforward_layernorm") var preFeedforwardLayernorm: + Gemma3nRMSNormWithScale + @ModuleInfo(key: "post_feedforward_layernorm") var postFeedforwardLayernorm: + Gemma3nRMSNormWithScale @ModuleInfo var altup: Gemma3nAltUp @ModuleInfo var laurel: Gemma3nLaurelBlock - @ModuleInfo var perLayerInputGate: Linear - @ModuleInfo var perLayerProjection: Linear - @ModuleInfo var postPerLayerInputNorm: Gemma3nRMSNormWithScale + @ModuleInfo(key: "per_layer_input_gate") var perLayerInputGate: Linear + @ModuleInfo(key: "per_layer_projection") var perLayerProjection: Linear + @ModuleInfo(key: "post_per_layer_input_norm") var postPerLayerInputNorm: Gemma3nRMSNormWithScale init(config: TextConfig, layerIdx: Int) { self.config = config @@ -1055,19 +1057,23 @@ private class Gemma3Model: Module { let vocabSize: Int let vocabSizePerLayerInput: Int let numHiddenLayers: Int - let perLayerProjectionScale: MLXArray - let perLayerInputScale: MLXArray - - @ModuleInfo var embedTokens: Gemma3nTextScaledWordEmbedding - @ModuleInfo var layers: [Gemma3nDecoderLayer] - @ModuleInfo var embedTokensPerLayer: Gemma3nTextScaledWordEmbedding - @ModuleInfo var perLayerModelProjection: Linear - @ModuleInfo var perLayerProjectionNorm: Gemma3nRMSNormWithScale - @ModuleInfo var altupProjections: [Linear] - @ModuleInfo var altupUnembedProjections: [Linear] + private let perLayerProjectionScale: MLXArray + private let perLayerInputScale: MLXArray + + @ModuleInfo(key: "embed_tokens") var embedTokens: Gemma3nTextScaledWordEmbedding + @ModuleInfo(key: "layers") var layers: [Gemma3nDecoderLayer] // This is correct! 
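The explicit `key:` arguments added in the hunks above recur throughout the rest of this series: MLXNN derives a parameter's checkpoint path from the Swift property name unless a key is supplied, so camelCase properties must declare their snake_case keys to line up with the names stored in the safetensors files. A minimal sketch of the mechanism (TinyBlock and its dimensions are illustrative, not part of the model):

    import MLX
    import MLXNN

    private class TinyBlock: Module {
        // Without `key:` the parameter path would be "qProj.weight";
        // with it, loading looks for "q_proj.weight" instead.
        @ModuleInfo(key: "q_proj") var qProj: Linear

        init(dims: Int) {
            self._qProj.wrappedValue = Linear(dims, dims, bias: false)
            super.init()
        }

        func callAsFunction(_ x: MLXArray) -> MLXArray {
            qProj(x)
        }
    }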
+ @ModuleInfo(key: "embed_tokens_per_layer") var embedTokensPerLayer: + Gemma3nTextScaledWordEmbedding + @ModuleInfo(key: "per_layer_model_projection") var perLayerModelProjection: Linear + @ModuleInfo(key: "per_layer_projection_norm") var perLayerProjectionNorm: + Gemma3nRMSNormWithScale + + @ModuleInfo(key: "altup_projections") var altupProjections: [Linear] + @ModuleInfo(key: "altup_unembed_projections") var altupUnembedProjections: [Linear] + @ModuleInfo var norm: Gemma3nRMSNormWithScale - @ModuleInfo var ropeEmbedding: Gemma3nRotaryEmbedding - @ModuleInfo var ropeEmbeddingLocal: Gemma3nRotaryEmbedding + @ModuleInfo(key: "rope_embedding") var ropeEmbedding: Gemma3nRotaryEmbedding + @ModuleInfo(key: "rope_embedding_local") var ropeEmbeddingLocal: Gemma3nRotaryEmbedding init(config: TextConfig) { self.config = config @@ -1106,11 +1112,10 @@ private class Gemma3Model: Module { scaleShift: 0.0 ) - self._altupProjections.wrappedValue = (1 ..< config.altupNumInputs).map { _ in + self._altupProjections.wrappedValue = (0 ..< (config.altupNumInputs - 1)).map { _ in Linear(config.hiddenSize, config.hiddenSize, bias: false) } - - self._altupUnembedProjections.wrappedValue = (1 ..< config.altupNumInputs).map { _ in + self._altupUnembedProjections.wrappedValue = (0 ..< (config.altupNumInputs - 1)).map { _ in Linear(config.hiddenSize, config.hiddenSize, bias: false) } @@ -1170,7 +1175,7 @@ private class Gemma3Model: Module { if mask == nil { let j = config.slidingWindowPattern - if j > 0 && j <= cacheArray.count { + if j > 0, j <= cacheArray.count { let globalCacheSlice = Array(cacheArray[(j - 1) ..< j]).compactMap { $0 } fullMask = createAttentionMask(h: h, cache: globalCacheSlice, returnArray: true) } @@ -1190,6 +1195,7 @@ private class Gemma3Model: Module { var hList = Array(repeating: h0, count: config.altupNumInputs) for i in 1 ..< config.altupNumInputs { + // `i - 1` is used because altupProjections has `altupNumInputs - 1` elements. let altupProj = altupProjections[i - 1](hList[i]) hList[i] = altupProj.asType(h0.dtype) let newMagnitude = pow(mean(hList[i].square(), axis: -1, keepDims: true), 0.5) @@ -1198,7 +1204,7 @@ private class Gemma3Model: Module { h = stacked(hList, axis: 0) - for (i, (layer, c)) in zip(layers[.. Bool { // MARK: - Main Model public class Gemma3n: Module, VLMModel, KVCacheDimensionProvider { - @ModuleInfo private var languageModel: LanguageModel - @ModuleInfo private var visionTower: Gemma3nVisionModel - @ModuleInfo private var audioTower: Gemma3nAudioModel - @ModuleInfo private var embedVision: Gemma3nMultimodalEmbedder - @ModuleInfo private var embedAudio: Gemma3nMultimodalEmbedder + @ModuleInfo(key: "language_model") private var languageModel: LanguageModel + @ModuleInfo(key: "vision_tower") private var visionTower: Gemma3nVisionModel + @ModuleInfo(key: "audio_tower") private var audioTower: Gemma3nAudioModel + @ModuleInfo(key: "embed_vision") private var embedVision: Gemma3nMultimodalEmbedder + @ModuleInfo(key: "embed_audio") private var embedAudio: Gemma3nMultimodalEmbedder public let config: ModelConfig @@ -1914,123 +1921,75 @@ public class Gemma3n: Module, VLMModel, KVCacheDimensionProvider { return languageModel(inputs: inputs, cache: convertedCache).logits } + // In class Gemma3n public func sanitize(weights: [String: MLXArray]) -> [String: MLXArray] { print("🔍 Gemma3n.sanitize: Starting with \(weights.count) weights") - var sanitizedWeights = [String: MLXArray]() - // Main model sanitization - remove "model." 
prefix + // This function's ONLY job is to remove the "model." prefix from keys. for (k, v) in weights { if k.hasPrefix("model.") { - // Python: ".".join(k.split(".")[1:]) -> remove first component, join rest - let components = k.split(separator: ".") - if components.count > 1 { - let newKey = components.dropFirst().joined(separator: ".") - sanitizedWeights[newKey] = v - } else { - sanitizedWeights[k] = v - } + let newKey = k.split(separator: ".").dropFirst().joined(separator: ".") + sanitizedWeights[newKey] = v } else { sanitizedWeights[k] = v } } - print("🔍 Gemma3n.sanitize: After main sanitization, have \(sanitizedWeights.count) weights") - - // Apply vision model sanitization using static method (matches Python from_pretrained exactly) - sanitizedWeights = Gemma3nVisionModel.sanitizeWeights(sanitizedWeights) - - print( - "🔍 Gemma3n.sanitize: After vision sanitization, have \(sanitizedWeights.count) weights") - - // Apply audio model sanitization for Conv2d and Conv1d layers - sanitizedWeights = Gemma3nAudioModel.sanitizeWeights(sanitizedWeights) - print( - "🔍 Gemma3n.sanitize: After audio sanitization, have \(sanitizedWeights.count) weights") - - // Debug: Print embedding-related weight keys - let embeddingKeys = sanitizedWeights.keys.filter { $0.contains("embed") } - print("🔍 Gemma3n.sanitize: Found \(embeddingKeys.count) embedding-related keys:") - for key in embeddingKeys.sorted() { - print("🔍 - \(key)") - } - - // Debug: Print RMSNorm-related weight keys - let rmsnormKeys = sanitizedWeights.keys.filter { - $0.contains("norm") && $0.contains("weight") - } - print("🔍 Gemma3n.sanitize: Found \(rmsnormKeys.count) RMSNorm-related keys:") - for key in rmsnormKeys.sorted() { - print("🔍 - \(key)") - } - + print("🔍 Gemma3n.sanitize: After prefix removal, have \(sanitizedWeights.count) weights") return sanitizedWeights } public static func fromPretrained(pathOrHfRepo: String) throws -> Gemma3n { let path = URL(fileURLWithPath: pathOrHfRepo) - print("🔍 Gemma3n.fromPretrained: Loading from \(pathOrHfRepo)") - // Load config let configPath = path.appendingPathComponent("config.json") let configData = try Data(contentsOf: configPath) - let configDict = try JSONSerialization.jsonObject(with: configData) as! [String: Any] - - // Create nested configs - let textConfig = try JSONDecoder().decode( - TextConfig.self, - from: JSONSerialization.data(withJSONObject: configDict["text_config"]!)) - let visionConfig = try JSONDecoder().decode( - VisionConfig.self, - from: JSONSerialization.data(withJSONObject: configDict["vision_config"]!)) - let audioConfig = try JSONDecoder().decode( - AudioConfig.self, - from: JSONSerialization.data(withJSONObject: configDict["audio_config"]!)) - let modelConfig = ModelConfig( - textConfig: textConfig, - visionConfig: visionConfig, - audioConfig: audioConfig, - modelType: configDict["model_type"] as? String ?? "gemma3n", - eosTokenId: configDict["eos_token_id"] as? 
[Int] - ) + let decoder = JSONDecoder() + decoder.keyDecodingStrategy = .convertFromSnakeCase + let modelConfig = try decoder.decode(ModelConfig.self, from: configData) let model = Gemma3n(modelConfig) - // Load weights + // Load all weight files into a single dictionary let weightFiles = try FileManager.default.contentsOfDirectory(atPath: path.path) .filter { $0.hasSuffix(".safetensors") } - - print("🔍 Gemma3n.fromPretrained: Found \(weightFiles.count) weight files: \(weightFiles)") - guard !weightFiles.isEmpty else { throw NSError( domain: "ModelLoading", code: 1, - userInfo: [NSLocalizedDescriptionKey: "No safetensors found"]) + userInfo: [NSLocalizedDescriptionKey: "No safetensors found in \(path.path)"]) } var weights = [String: MLXArray]() for weightFile in weightFiles { - let weightPath = path.appendingPathComponent(weightFile) - print("🔍 Gemma3n.fromPretrained: Loading weights from \(weightFile)") - let fileWeights = try loadArrays(url: weightPath) - print( - "🔍 Gemma3n.fromPretrained: Loaded \(fileWeights.count) weights from \(weightFile)") + let fileWeights = try loadArrays(url: path.appendingPathComponent(weightFile)) weights.merge(fileWeights) { _, new in new } } - print("🔍 Gemma3n.fromPretrained: Total weights loaded: \(weights.count)") - // Print some sample weight keys to understand the structure - let sampleKeys = Array(weights.keys.prefix(10)) - print("🔍 Gemma3n.fromPretrained: Sample weight keys: \(sampleKeys)") + // Step 1: Main sanitization (remove "model." prefix) + var sanitizedWeights = model.sanitize(weights: weights) - let sanitizedWeights = model.sanitize(weights: weights) + // Step 2: Vision model sanitization (transpose conv weights) + sanitizedWeights = Gemma3nVisionModel.sanitizeWeights(sanitizedWeights) - print("🔍 Gemma3n.fromPretrained: Attempting to update model parameters...") + // Step 3: Audio model sanitization (transpose conv weights) - THIS WAS MISSING + sanitizedWeights = model.audioTower.sanitize(weights: sanitizedWeights) + + // Step 4: Handle tied lm_head weights + if sanitizedWeights["language_model.lm_head.weight"] == nil { + if let embedWeight = sanitizedWeights["language_model.model.embed_tokens.weight"] { + print("🔍 Tying lm_head weight.") + sanitizedWeights["language_model.lm_head.weight"] = embedWeight + } + } + + // Step 5: Load the weights + print("🔍 Attempting to load \(sanitizedWeights.count) final weights...") try model.update(parameters: ModuleParameters.unflattened(sanitizedWeights), verify: [.all]) - print("🔍 Gemma3n.fromPretrained: Model parameters updated successfully!") + print("✅ Model loaded successfully!") return model } @@ -2068,8 +2027,8 @@ private class Gemma3nAudioRelativePositionEmbedding: Module { let maxBackward: Int let maxForward: Int - @ModuleInfo var posProj: Linear - @ModuleInfo var invTimescales: MLXArray + @ModuleInfo(key: "pos_proj") var posProj: Linear + @ModuleInfo(key: "inv_timescales") var invTimescales: MLXArray init(config: AudioConfig) { self.config = config @@ -2378,7 +2337,7 @@ private class Gemma3nAudioSubSampleConvProjection: Module { @ModuleInfo var conv0: Gemma3nAudioSSCPConvBlock @ModuleInfo var conv1: Gemma3nAudioSSCPConvBlock - @ModuleInfo var inputProjLinear: Linear + @ModuleInfo(key: "input_proj_linear") var inputProjLinear: Linear init(config: AudioConfig) { self.config = config @@ -2469,11 +2428,12 @@ private class Gemma3nAudioAttention: Module { let localCausalValidMask: MLXArray let softcap: MLXArray - @ModuleInfo var relativePositionEmbedding: Gemma3nAudioRelativePositionEmbedding - 
@ModuleInfo var perDimScale: MLXArray - @ModuleInfo var qProj: Linear - @ModuleInfo var kProj: Linear - @ModuleInfo var vProj: Linear + @ModuleInfo(key: "relative_position_embedding") var relativePositionEmbedding: + Gemma3nAudioRelativePositionEmbedding + @ModuleInfo(key: "per_dim_scale") var perDimScale: MLXArray + @ModuleInfo(key: "q_proj") var qProj: Linear + @ModuleInfo(key: "k_proj") var kProj: Linear + @ModuleInfo(key: "v_proj") var vProj: Linear init(config: AudioConfig) { self.config = config @@ -2665,7 +2625,7 @@ private class Gemma3nAudioAttention: Module { private class Gemma3nAudioConformerAttention: Module { let config: AudioConfig let postInFeatures: Int - let gradientClipping: MLXArray + private let gradientClipping: MLXArray @ModuleInfo var preAttnNorm: Gemma3nRMSNormWithScale @ModuleInfo var attn: Gemma3nAudioAttention @@ -2707,8 +2667,8 @@ private class Gemma3nAudioConformerAttention: Module { // MARK: - Conformer Feed Forward private class Gemma3nAudioConformerFeedForward: Module { let config: AudioConfig - let gradientClipping: MLXArray - let postLayerScale: MLXArray + private let gradientClipping: MLXArray + private let postLayerScale: MLXArray @ModuleInfo var preLayerNorm: Gemma3nRMSNormWithScale @ModuleInfo var ffwLayer1: Linear @@ -2744,7 +2704,7 @@ private class Gemma3nAudioConformerFeedForward: Module { // MARK: - Conformer Light Conv1D private class Gemma3nAudioConformerLightConv1d: Module { let config: AudioConfig - let gradientClipping: MLXArray + private let gradientClipping: MLXArray let causalPadding: Int @ModuleInfo var preLayerNorm: Gemma3nRMSNormWithScale @@ -2813,7 +2773,7 @@ private class Gemma3nAudioConformerLightConv1d: Module { // MARK: - Conformer Block private class Gemma3nAudioConformerBlock: Module { let config: AudioConfig - let gradientClipping: MLXArray + private let gradientClipping: MLXArray @ModuleInfo var ffwLayerStart: Gemma3nAudioConformerFeedForward @ModuleInfo var attention: Gemma3nAudioConformerAttention @@ -3216,11 +3176,13 @@ private class MultiQueryAttention2d: Module { let scale: Float @ModuleInfo var queryProj: Conv2d - @ModuleInfo var keyDownConv: Conv2d? - @ModuleInfo var keyNorm: RMSNormAct2d? + + @ModuleInfo var keyDownConv: UnaryLayer + @ModuleInfo var keyNorm: UnaryLayer + @ModuleInfo var valueDownConv: UnaryLayer + @ModuleInfo var valueNorm: UnaryLayer + @ModuleInfo var keyProj: Conv2d - @ModuleInfo var valueDownConv: Conv2d? - @ModuleInfo var valueNorm: RMSNormAct2d? 
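As this hunk continues below, the optional `Conv2d?`/`RMSNormAct2d?` members are replaced by non-optional `UnaryLayer` fields that fall back to `Identity()` when no downsampling is configured, which removes the `if let` unwrapping at every call site. A hedged sketch of the pattern (PassthroughBlock and its shapes are invented for illustration):

    import MLX
    import MLXNN

    private class PassthroughBlock: Module {
        @ModuleInfo var down: UnaryLayer

        init(dim: Int, kvStride: Int) {
            // A no-op Identity keeps the module tree uniform: both branches
            // produce something callable as (MLXArray) -> MLXArray.
            if kvStride > 1 {
                self._down.wrappedValue = Conv2d(
                    inputChannels: dim, outputChannels: dim,
                    kernelSize: IntOrPair(3), stride: IntOrPair(kvStride),
                    padding: IntOrPair(1), groups: dim, bias: false)
            } else {
                self._down.wrappedValue = Identity()
            }
            super.init()
        }

        func callAsFunction(_ x: MLXArray) -> MLXArray {
            down(x)  // no optional chaining needed
        }
    }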
@ModuleInfo var valueProj: Conv2d @ModuleInfo var attnDrop: UnaryLayer @ModuleInfo var outputProj: Conv2d @@ -3264,15 +3226,15 @@ private class MultiQueryAttention2d: Module { outputChannels: dim, kernelSize: IntOrPair(dwKernelSize), stride: IntOrPair(kvStride), - padding: IntOrPair((dwKernelSize - 1) / 2), + padding: IntOrPair((dwKernelSize - 1) / 2 * dilation), dilation: IntOrPair(dilation), - groups: dim, + groups: dim, // Depthwise bias: false ) self._keyNorm.wrappedValue = RMSNormAct2d(numChannels: dim, eps: 1e-6, applyAct: false) } else { - self._keyDownConv.wrappedValue = nil - self._keyNorm.wrappedValue = nil + self._keyDownConv.wrappedValue = Identity() + self._keyNorm.wrappedValue = Identity() } self._keyProj.wrappedValue = Conv2d( inputChannels: dim, @@ -3288,16 +3250,16 @@ private class MultiQueryAttention2d: Module { outputChannels: dim, kernelSize: IntOrPair(dwKernelSize), stride: IntOrPair(kvStride), - padding: IntOrPair((dwKernelSize - 1) / 2), + padding: IntOrPair((dwKernelSize - 1) / 2 * dilation), dilation: IntOrPair(dilation), - groups: dim, + groups: dim, // Depthwise bias: false ) self._valueNorm.wrappedValue = RMSNormAct2d( numChannels: dim, eps: 1e-6, applyAct: false) } else { - self._valueDownConv.wrappedValue = nil - self._valueNorm.wrappedValue = nil + self._valueDownConv.wrappedValue = Identity() + self._valueNorm.wrappedValue = Identity() } self._valueProj.wrappedValue = Conv2d( inputChannels: dim, @@ -3354,23 +3316,13 @@ private class MultiQueryAttention2d: Module { let q = queryProj(x) let qReshaped = reshapeProjectedQuery(q, numHeads: numHeads, keyDim: keyDim) - var k = x - if let keyDownConv = keyDownConv { - k = keyDownConv(k) - } - if let keyNorm = keyNorm { - k = keyNorm(k) - } + var k = keyDownConv(x) + k = keyNorm(k) k = keyProj(k) let kReshaped = reshapeInput(k) - var v = x - if let valueDownConv = valueDownConv { - v = valueDownConv(v) - } - if let valueNorm = valueNorm { - v = valueNorm(v) - } + var v = valueDownConv(x) + v = valueNorm(v) v = valueProj(v) let vReshaped = reshapeInput(v) @@ -3720,7 +3672,7 @@ private class MobileNetV5MultiScaleFusionAdapter: Module { img = pooled.transposed(0, 2, 3, 1) } - img = noskip ? img : norm(img) + img = noskip ? norm(img) : img } return img @@ -3871,53 +3823,31 @@ private class Gemma3nVisionModel: Module { var sanitizedWeights = [String: MLXArray]() var skipTranspose = false - print("🔍 VisionModel.sanitize: Starting with \(weights.count) weights") - - // Match Python exactly: use the specific key it expects + // This logic is correct let testKey = "vision_tower.timm_model.blocks.0.0.conv_exp.weight" if let convWeight = weights[testKey] { - let (_, H, _, C) = ( - convWeight.shape[0], convWeight.shape[1], convWeight.shape[2], convWeight.shape[3] - ) - print( - "🔍 VisionModel.sanitize: Found test key '\(testKey)' with shape \(convWeight.shape), H=\(H), C=\(C)" - ) - if C > H { + let shape = convWeight.shape + if shape.count == 4, shape[3] > shape[1] { skipTranspose = true - print( - "🔍 VisionModel.sanitize: Setting skipTranspose=true because C(\(C)) > H(\(H))") } - } else { - print( - "🔍 VisionModel.sanitize: WARNING - Expected test key '\(testKey)' not found in weights!" 
- ) } - print("🔍 VisionModel.sanitize: skipTranspose=\(skipTranspose)") - for (k, v) in weights { - // PyTorch conv2d weight: [out_channels, in_channels, kH, kW] - // MLX conv2d weight: [out_channels, kH, KW, in_channels] if (k.contains("conv") && k.contains("weight")) || (k.contains("attn") && k.contains("proj.weight")) { if v.shape.count == 4 && !skipTranspose { - print( - "🔍 VisionModel.sanitize: Transposing '\(k)' from \(v.shape) to \(v.transposed(0, 2, 3, 1).shape)" - ) sanitizedWeights[k] = v.transposed(0, 2, 3, 1) } else { - print( - "🔍 VisionModel.sanitize: Keeping '\(k)' as-is with shape \(v.shape) (skipTranspose=\(skipTranspose))" - ) sanitizedWeights[k] = v } } else { + // THIS IS THE MISSING BLOCK + // Copy all other weights (biases, norm layers, etc.) sanitizedWeights[k] = v } } - print("🔍 VisionModel.sanitize: Completed with \(sanitizedWeights.count) weights") return sanitizedWeights } } @@ -4020,78 +3950,29 @@ private class Gemma3nAudioModel: Module { func sanitize(weights: [String: MLXArray]) -> [String: MLXArray] { var sanitizedWeights = [String: MLXArray]() - print("🔍 AudioModel.sanitize: Starting with \(weights.count) weights") - for (k, v) in weights { if k.contains("conv.weight") { - if checkArrayShape(v) { - print( - "🔍 AudioModel.sanitize: Keeping conv weight '\(k)' as-is with shape \(v.shape)" - ) - sanitizedWeights[k] = v - } else { - print( - "🔍 AudioModel.sanitize: Transposing conv weight '\(k)' from \(v.shape) to \(v.transposed(0, 2, 3, 1).shape)" - ) + // The checkArrayShape function is not robust. + // The Python reference doesn't use it. It's safer to just transpose. + // Assuming NCHW -> NHWC for Conv2d + if v.ndim == 4 { sanitizedWeights[k] = v.transposed(0, 2, 3, 1) - } - } else if k.contains("conv1d.weight") { - if checkArrayShape(v) { - print( - "🔍 AudioModel.sanitize: Keeping conv1d weight '\(k)' as-is with shape \(v.shape)" - ) - sanitizedWeights[k] = v } else { - print( - "🔍 AudioModel.sanitize: Transposing conv1d weight '\(k)' from \(v.shape) to \(v.transposed(0, 2, 1).shape)" - ) - sanitizedWeights[k] = v.transposed(0, 2, 1) - } - } else { - sanitizedWeights[k] = v - } - } - - print("🔍 AudioModel.sanitize: Completed with \(sanitizedWeights.count) weights") - return sanitizedWeights - } - - static func sanitizeWeights(_ weights: [String: MLXArray]) -> [String: MLXArray] { - var sanitizedWeights = [String: MLXArray]() - - print("🔍 AudioModel.sanitizeWeights: Starting with \(weights.count) weights") - - for (k, v) in weights { - if k.contains("conv.weight") { - if checkArrayShape(v) { - print( - "🔍 AudioModel.sanitizeWeights: Keeping conv weight '\(k)' as-is with shape \(v.shape)" - ) sanitizedWeights[k] = v - } else { - print( - "🔍 AudioModel.sanitizeWeights: Transposing conv weight '\(k)' from \(v.shape) to \(v.transposed(0, 2, 3, 1).shape)" - ) - sanitizedWeights[k] = v.transposed(0, 2, 3, 1) } } else if k.contains("conv1d.weight") { - if checkArrayShape(v) { - print( - "🔍 AudioModel.sanitizeWeights: Keeping conv1d weight '\(k)' as-is with shape \(v.shape)" - ) - sanitizedWeights[k] = v - } else { - print( - "🔍 AudioModel.sanitizeWeights: Transposing conv1d weight '\(k)' from \(v.shape) to \(v.transposed(0, 2, 1).shape)" - ) + // Assuming NCL -> NLC for Conv1d + if v.ndim == 3 { sanitizedWeights[k] = v.transposed(0, 2, 1) + } else { + sanitizedWeights[k] = v } } else { + // THIS IS THE MISSING BLOCK sanitizedWeights[k] = v } } - print("🔍 AudioModel.sanitizeWeights: Completed with \(sanitizedWeights.count) weights") return sanitizedWeights } } From 
91449cf61efa60087dde3beed22cf8310f72cf96 Mon Sep 17 00:00:00 2001 From: David Koski Date: Fri, 27 Jun 2025 11:20:45 -0700 Subject: [PATCH 06/19] update for computed keys, key names --- Libraries/MLXVLM/Models/Gemma3n.swift | 104 +++++++++++++------------- 1 file changed, 53 insertions(+), 51 deletions(-) diff --git a/Libraries/MLXVLM/Models/Gemma3n.swift b/Libraries/MLXVLM/Models/Gemma3n.swift index e8a600c0..c0341038 100644 --- a/Libraries/MLXVLM/Models/Gemma3n.swift +++ b/Libraries/MLXVLM/Models/Gemma3n.swift @@ -495,8 +495,8 @@ private class Gemma3nRotaryEmbedding: Module { let originalMaxSeqLen: Int let config: TextConfig let attentionScaling: Float - let invFreq: MLXArray - let originalInvFreq: MLXArray + let _invFreq: MLXArray + let _originalInvFreq: MLXArray init(config: TextConfig) { if let ropeScaling = config.ropeScaling { @@ -516,8 +516,8 @@ private class Gemma3nRotaryEmbedding: Module { self.attentionScaling = 1.0 let (invFreq, _) = Self.computeDefaultRopeParameters(config: config) - self.invFreq = MLXArray(invFreq).asType(.float32) - self.originalInvFreq = MLXArray(invFreq).asType(.float32) + self._invFreq = MLXArray(invFreq).asType(.float32) + self._originalInvFreq = MLXArray(invFreq).asType(.float32) super.init() } @@ -538,7 +538,7 @@ private class Gemma3nRotaryEmbedding: Module { } func callAsFunction(_ x: MLXArray, positionIds: MLXArray) -> (MLXArray, MLXArray) { - let invFreqExpanded = expandedDimensions(invFreq, axes: [0, 2]) + let invFreqExpanded = expandedDimensions(_invFreq, axes: [0, 2]) let positionIdsExpanded = expandedDimensions(positionIds.asType(.float32), axes: [1]) let freqs = matmul( @@ -750,7 +750,7 @@ private class Gemma3nAltUp: Module { @ModuleInfo(key: "prediction_coefs") var predictionCoefs: Linear @ModuleInfo(key: "modality_router") var modalityRouter: Linear @ModuleInfo(key: "router_norm") var routerNorm: Gemma3nRMSNormWithScale - @ModuleInfo(key: "router_input_scale") var routerInputScale: MLXArray + let _routerInputScale: MLXArray let config: TextConfig @@ -778,14 +778,14 @@ private class Gemma3nAltUp: Module { eps: config.rmsNormEps, scaleShift: 0.0 ) - self._routerInputScale.wrappedValue = MLXArray(pow(Float(config.hiddenSize), -1.0)) + self._routerInputScale = MLXArray(pow(Float(config.hiddenSize), -1.0)) super.init() } func computeRouterModalities(_ x: MLXArray) -> MLXArray { let routerInputs = - routerNorm(x) * routerInputScale.asType(routerNorm.weight.dtype) + routerNorm(x) * _routerInputScale.asType(routerNorm.weight.dtype) let routed = modalityRouter(routerInputs).asType(.float32) return tanh(routed) } @@ -1057,8 +1057,8 @@ private class Gemma3Model: Module { let vocabSize: Int let vocabSizePerLayerInput: Int let numHiddenLayers: Int - private let perLayerProjectionScale: MLXArray - private let perLayerInputScale: MLXArray + private let _perLayerProjectionScale: MLXArray + private let _perLayerInputScale: MLXArray @ModuleInfo(key: "embed_tokens") var embedTokens: Gemma3nTextScaledWordEmbedding @ModuleInfo(key: "layers") var layers: [Gemma3nDecoderLayer] // This is correct! 
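The renames in this patch follow MLX's privacy convention: arrays that are computed in `init` rather than loaded from the checkpoint are demoted from `@ModuleInfo` wrappers to plain `let`s with a leading underscore, which (as in Python MLX) keeps them out of parameter discovery so that `update(parameters:verify:)` no longer expects matching checkpoint entries for them. A small illustration of the distinction (RopeSketch is hypothetical; only the property-wrapper behavior is the point):

    import Foundation
    import MLX
    import MLXNN

    private class RopeSketch: Module {
        // Discovered as a parameter: verification expects "scale" in the weights.
        @ParameterInfo(key: "scale") var scale: MLXArray

        // Leading underscore: skipped by parameter discovery, so a constant
        // computed here never triggers a missing-weight error.
        let _invFreq: MLXArray

        init(dims: Int, base: Float = 10_000) {
            self._scale.wrappedValue = MLXArray.ones([dims])
            let freqs = (0 ..< dims / 2).map { 1.0 / pow(base, Float(2 * $0) / Float(dims)) }
            self._invFreq = MLXArray(freqs)
            super.init()
        }
    }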
@@ -1125,8 +1125,8 @@ private class Gemma3Model: Module { scaleShift: 0.0 ) - self.perLayerProjectionScale = MLXArray(pow(Float(hiddenSize), -0.5)) - self.perLayerInputScale = rsqrt(MLXArray(2.0)) + self._perLayerProjectionScale = MLXArray(pow(Float(hiddenSize), -0.5)) + self._perLayerInputScale = rsqrt(MLXArray(2.0)) self._ropeEmbedding.wrappedValue = Gemma3nRotaryEmbedding(config: config) @@ -1261,7 +1261,8 @@ private class Gemma3Model: Module { func projectPerLayerInputs(_ inputsEmbeds: MLXArray, perLayerInputs: MLXArray?) -> MLXArray { var perLayerProjection = perLayerModelProjection(inputsEmbeds) - perLayerProjection = perLayerProjection * perLayerProjectionScale.asType(inputsEmbeds.dtype) + perLayerProjection = + perLayerProjection * _perLayerProjectionScale.asType(inputsEmbeds.dtype) perLayerProjection = perLayerProjection.reshaped( Array(inputsEmbeds.shape.dropLast()) + [ @@ -1282,7 +1283,7 @@ private class Gemma3Model: Module { } return (perLayerProjection + adjustedPerLayerInputs) - * perLayerInputScale.asType(inputsEmbeds.dtype) + * _perLayerInputScale.asType(inputsEmbeds.dtype) } } @@ -2335,8 +2336,8 @@ private class Gemma3nAudioSubSampleConvProjection: Module { let config: AudioConfig let inputProjInFeatures: Int - @ModuleInfo var conv0: Gemma3nAudioSSCPConvBlock - @ModuleInfo var conv1: Gemma3nAudioSSCPConvBlock + @ModuleInfo(key: "conv_0") var conv0: Gemma3nAudioSSCPConvBlock + @ModuleInfo(key: "conv_1") var conv1: Gemma3nAudioSSCPConvBlock @ModuleInfo(key: "input_proj_linear") var inputProjLinear: Linear init(config: AudioConfig) { @@ -2625,7 +2626,7 @@ private class Gemma3nAudioAttention: Module { private class Gemma3nAudioConformerAttention: Module { let config: AudioConfig let postInFeatures: Int - private let gradientClipping: MLXArray + private let _gradientClipping: MLXArray @ModuleInfo var preAttnNorm: Gemma3nRMSNormWithScale @ModuleInfo var attn: Gemma3nAudioAttention @@ -2636,7 +2637,7 @@ private class Gemma3nAudioConformerAttention: Module { self.config = config let headDim = config.hiddenSize / config.confNumAttentionHeads self.postInFeatures = config.hiddenSize - self.gradientClipping = MLXArray(config.gradientClipping) + self._gradientClipping = MLXArray(config.gradientClipping) self._preAttnNorm.wrappedValue = Gemma3nRMSNormWithScale(dim: config.hiddenSize) self._attn.wrappedValue = Gemma3nAudioAttention(config: config) @@ -2648,7 +2649,7 @@ private class Gemma3nAudioConformerAttention: Module { func callAsFunction(_ x: MLXArray, mask: MLXArray) -> MLXArray { let audioencodingsInputToAttn = x - let clippedX = clip(x, min: -gradientClipping, max: gradientClipping) + let clippedX = clip(x, min: -_gradientClipping, max: _gradientClipping) let audioencodingsNorm = preAttnNorm(clippedX) let audioencodingsAttnOut = attn(audioencodingsNorm, mask: mask) @@ -2659,7 +2660,7 @@ private class Gemma3nAudioConformerAttention: Module { let audioencodingsReshaped = audioencodingsAttnOut.reshaped([b, t, numHeads * headDim]) let postResult = post(audioencodingsReshaped) - let clippedPost = clip(postResult, min: -gradientClipping, max: gradientClipping) + let clippedPost = clip(postResult, min: -_gradientClipping, max: _gradientClipping) return audioencodingsInputToAttn + postNorm(clippedPost) } } @@ -2667,18 +2668,18 @@ private class Gemma3nAudioConformerAttention: Module { // MARK: - Conformer Feed Forward private class Gemma3nAudioConformerFeedForward: Module { let config: AudioConfig - private let gradientClipping: MLXArray - private let postLayerScale: MLXArray + private 
let _gradientClipping: MLXArray + private let _postLayerScale: MLXArray - @ModuleInfo var preLayerNorm: Gemma3nRMSNormWithScale - @ModuleInfo var ffwLayer1: Linear - @ModuleInfo var ffwLayer2: Linear - @ModuleInfo var postLayerNorm: Gemma3nRMSNormWithScale + @ModuleInfo(key: "pre_layer_norm") var preLayerNorm: Gemma3nRMSNormWithScale + @ModuleInfo(key: "ffw_layer_1") var ffwLayer1: Linear + @ModuleInfo(key: "ffw_layer_2") var ffwLayer2: Linear + @ModuleInfo(key: "post_layer_norm") var postLayerNorm: Gemma3nRMSNormWithScale init(config: AudioConfig) { self.config = config - self.gradientClipping = MLXArray(config.gradientClipping) - self.postLayerScale = MLXArray(config.confResidualWeight) + self._gradientClipping = MLXArray(config.gradientClipping) + self._postLayerScale = MLXArray(config.confResidualWeight) self._preLayerNorm.wrappedValue = Gemma3nRMSNormWithScale(dim: config.hiddenSize) self._ffwLayer1.wrappedValue = Linear(config.hiddenSize, config.hiddenSize * 4, bias: false) @@ -2690,32 +2691,32 @@ private class Gemma3nAudioConformerFeedForward: Module { func callAsFunction(_ x: MLXArray) -> MLXArray { let residual = x - let clippedX = clip(x, min: -gradientClipping, max: gradientClipping) + let clippedX = clip(x, min: -_gradientClipping, max: _gradientClipping) var result = preLayerNorm(clippedX) result = ffwLayer1(result) result = silu(result) result = ffwLayer2(result) - let clippedResult = clip(result, min: -gradientClipping, max: gradientClipping) + let clippedResult = clip(result, min: -_gradientClipping, max: _gradientClipping) let normedResult = postLayerNorm(clippedResult) - return residual + (normedResult * postLayerScale) + return residual + (normedResult * _postLayerScale) } } // MARK: - Conformer Light Conv1D private class Gemma3nAudioConformerLightConv1d: Module { let config: AudioConfig - private let gradientClipping: MLXArray + private let _gradientClipping: MLXArray let causalPadding: Int - @ModuleInfo var preLayerNorm: Gemma3nRMSNormWithScale - @ModuleInfo var linearStart: Linear - @ModuleInfo var depthwiseConv1d: Conv1d - @ModuleInfo var convNorm: Gemma3nRMSNormWithScale - @ModuleInfo var linearEnd: Linear + @ModuleInfo(key: "pre_layer_norm") var preLayerNorm: Gemma3nRMSNormWithScale + @ModuleInfo(key: "linear_start") var linearStart: Linear + @ModuleInfo(key: "depthwise_conv1d") var depthwiseConv1d: Conv1d + @ModuleInfo(key: "conv_norm") var convNorm: Gemma3nRMSNormWithScale + @ModuleInfo(key: "linear_end") var linearEnd: Linear init(config: AudioConfig) { self.config = config - self.gradientClipping = MLXArray(config.gradientClipping) + self._gradientClipping = MLXArray(config.gradientClipping) self.causalPadding = config.confConvKernelSize - 1 self._preLayerNorm.wrappedValue = Gemma3nRMSNormWithScale( @@ -2761,7 +2762,7 @@ private class Gemma3nAudioConformerLightConv1d: Module { ) result = depthwiseConv1d(paddedAudio.transposed(0, 2, 1)) - result = clip(result, min: -gradientClipping, max: gradientClipping) + result = clip(result, min: -_gradientClipping, max: _gradientClipping) result = convNorm(result) result = silu(result) result = linearEnd(result) @@ -2967,11 +2968,11 @@ private class ConvNormAct: Module, UnaryLayer { // MARK: - Universal Inverted Residual private class UniversalInvertedResidual: Module, UnaryLayer { let hasSkip: Bool - @ModuleInfo var dwStart: UnaryLayer - @ModuleInfo var pwExp: ConvNormAct - @ModuleInfo var dwMid: UnaryLayer - @ModuleInfo var pwProj: ConvNormAct - @ModuleInfo var layerScale: UnaryLayer + @ModuleInfo(key: "dw_start") var 
dwStart: UnaryLayer + @ModuleInfo(key: "pw_exp") var pwExp: ConvNormAct + @ModuleInfo(key: "dw_mid") var dwMid: UnaryLayer + @ModuleInfo(key: "pw_proj") var pwProj: ConvNormAct + @ModuleInfo(key: "layer_scale") var layerScale: UnaryLayer init( inChannels: Int, @@ -3088,9 +3089,9 @@ private class UniversalInvertedResidual: Module, UnaryLayer { // MARK: - Edge Residual private class EdgeResidual: Module, UnaryLayer { let hasSkip: Bool - @ModuleInfo var convExp: Conv2d + @ModuleInfo(key: "conv_exp") var convExp: Conv2d @ModuleInfo var bn1: RMSNormAct2d - @ModuleInfo var convPwl: Conv2d + @ModuleInfo(key: "conv_pwl") var convPwl: Conv2d @ModuleInfo var bn2: RMSNormAct2d init( @@ -3184,9 +3185,9 @@ private class MultiQueryAttention2d: Module { @ModuleInfo var keyProj: Conv2d @ModuleInfo var valueProj: Conv2d - @ModuleInfo var attnDrop: UnaryLayer + @ModuleInfo(key: "attn_drop") var attnDrop: UnaryLayer @ModuleInfo var outputProj: Conv2d - @ModuleInfo var projDrop: UnaryLayer + @ModuleInfo(key: "proj_drop") var projDrop: UnaryLayer init( dim: Int, @@ -3681,7 +3682,7 @@ private class MobileNetV5MultiScaleFusionAdapter: Module { // MARK: - Vision Tower private class VisionTower: Module { - @ModuleInfo var convStem: ConvNormAct + @ModuleInfo(key: "conv_stem") var convStem: ConvNormAct @ModuleInfo var blocks: [[UnaryLayer]] @ModuleInfo var msfa: MobileNetV5MultiScaleFusionAdapter @@ -3800,7 +3801,7 @@ private class VisionTower: Module { // MARK: - Complete Vision Model private class Gemma3nVisionModel: Module { let modelType: String - @ModuleInfo var timmModel: VisionTower + @ModuleInfo(key: "timm_model") var timmModel: VisionTower init(config: VisionConfig) { self.modelType = config.modelType @@ -3856,7 +3857,8 @@ private class Gemma3nVisionModel: Module { private class Gemma3nAudioModel: Module { let config: AudioConfig - @ModuleInfo var subsampleConvProjection: Gemma3nAudioSubSampleConvProjection + @ModuleInfo(key: "subsample_conv_projection") var subsampleConvProjection: + Gemma3nAudioSubSampleConvProjection @ModuleInfo var conformer: [Gemma3nAudioConformerBlock] init(config: AudioConfig) { From 0d6d026997b9481eafa604f000a89a315c230388 Mon Sep 17 00:00:00 2001 From: Anthony DePasquale Date: Sat, 28 Jun 2025 09:39:59 +0200 Subject: [PATCH 07/19] Use 4-bit quantized models --- Libraries/MLXVLM/VLMModelFactory.swift | 12 ++++++------ Tools/llm-tool/LLMTool.swift | 4 ++-- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/Libraries/MLXVLM/VLMModelFactory.swift b/Libraries/MLXVLM/VLMModelFactory.swift index d56a9338..cf63e0ac 100644 --- a/Libraries/MLXVLM/VLMModelFactory.swift +++ b/Libraries/MLXVLM/VLMModelFactory.swift @@ -169,14 +169,14 @@ public class VLMRegistry: AbstractModelRegistry, @unchecked Sendable { extraEOSTokens: [""] ) - static public let gemma3n_E2B_instruct = ModelConfiguration( - id: "mlx-community/gemma-3n-E2B-it-bf16", + static public let gemma3n_E2B = ModelConfiguration( + id: "mlx-community/gemma-3n-E2B-it-4bit", defaultPrompt: "Describe this image.", extraEOSTokens: [""] ) - static public let gemma3n_E4B_instruct = ModelConfiguration( - id: "mlx-community/gemma-3n-E4B-it-bf16", + static public let gemma3n_E4B = ModelConfiguration( + id: "mlx-community/gemma-3n-E4B-it-4bit", defaultPrompt: "Describe this image.", extraEOSTokens: [""] ) @@ -196,8 +196,8 @@ public class VLMRegistry: AbstractModelRegistry, @unchecked Sendable { gemma3_4B_qat_4bit, gemma3_12B_qat_4bit, gemma3_27B_qat_4bit, - gemma3n_E2B_instruct, - gemma3n_E4B_instruct, + gemma3n_E2B, + gemma3n_E4B, 
smolvlm, ] } diff --git a/Tools/llm-tool/LLMTool.swift b/Tools/llm-tool/LLMTool.swift index 4140594f..f02fca15 100644 --- a/Tools/llm-tool/LLMTool.swift +++ b/Tools/llm-tool/LLMTool.swift @@ -302,9 +302,9 @@ struct EvaluateCommand: AsyncParsableCommand { let modelFactory: ModelFactory let defaultModel: ModelConfiguration - // Always use VLM factory and gemma3n_E2B_instruct for testing + // Always use VLM factory and gemma3n_E2B for testing modelFactory = VLMModelFactory.shared - defaultModel = MLXVLM.VLMRegistry.gemma3n_E2B_instruct + defaultModel = MLXVLM.VLMRegistry.gemma3n_E2B // Load the model let modelContainer = try await memory.start { [args] in From 8e1d7d513da9ea7b421f9b6c453abd38214f4466 Mon Sep 17 00:00:00 2001 From: Anthony DePasquale Date: Sat, 28 Jun 2025 11:38:02 +0200 Subject: [PATCH 08/19] Clean up --- Libraries/MLXVLM/Models/Gemma3n.swift | 59 +++++++++++---------------- 1 file changed, 24 insertions(+), 35 deletions(-) diff --git a/Libraries/MLXVLM/Models/Gemma3n.swift b/Libraries/MLXVLM/Models/Gemma3n.swift index c0341038..9c6db502 100644 --- a/Libraries/MLXVLM/Models/Gemma3n.swift +++ b/Libraries/MLXVLM/Models/Gemma3n.swift @@ -1146,18 +1146,18 @@ private class Gemma3Model: Module { perLayerInputs: MLXArray? = nil ) -> MLXArray { var h: MLXArray - if let inputsEmbeds = inputsEmbeds { + if let inputsEmbeds { h = inputsEmbeds - } else if let inputs = inputs { + } else if let inputs { h = embedTokens(inputs) } else { fatalError("Either inputs or inputsEmbeds must be provided") } let perLayerInputsProcessed: MLXArray - if let perLayerInputs = perLayerInputs { + if let perLayerInputs { perLayerInputsProcessed = perLayerInputs - } else if let inputs = inputs { + } else if let inputs { perLayerInputsProcessed = getPerLayerInputs(inputs) } else { fatalError("Cannot generate per layer inputs without input ids") @@ -1213,7 +1213,7 @@ private class Gemma3Model: Module { == "global_attention" let localMask: MLXFast.ScaledDotProductAttentionMaskMode - if let mask = mask { + if let mask { localMask = mask } else if isGlobal { localMask = fullMask @@ -1437,9 +1437,9 @@ private class Gemma3nMultimodalEmbedder: Module, UnaryLayer { } let embNorm: MLXArray - if let inputsEmbeds = inputsEmbeds { + if let inputsEmbeds { embNorm = softEmbeddingNorm(inputsEmbeds) - } else if let inputIds = inputIds { + } else if let inputIds { let hardEmb = embedding(inputIds - vocabOffset) embNorm = hardEmbeddingNorm(hardEmb) } else { @@ -1490,7 +1490,7 @@ private func gemma3nAttentionWithCacheUpdate( // Update cache and get cached keys/values (matches Python's cache.update_and_fetch) let (cachedKeys, cachedValues): (MLXArray, MLXArray) - if let cache = cache { + if let cache { (cachedKeys, cachedValues) = cache.update(keys: keys, values: values) } else { (cachedKeys, cachedValues) = (keys, values) @@ -1667,7 +1667,6 @@ private func maskedScatter( private func checkArrayShape(_ arr: MLXArray) -> Bool { let shape = arr.shape guard shape.count == 4 else { - print("🔍 checkArrayShape: Array has \(shape.count) dimensions, not 4") return false } @@ -1792,7 +1791,7 @@ public class Gemma3n: Module, VLMModel, KVCacheDimensionProvider { } // Process audio features - if let inputFeatures = inputFeatures, let inputFeaturesMask = inputFeaturesMask { + if let inputFeatures, let inputFeaturesMask = inputFeaturesMask { let (audioFeatures, audioMask) = getAudioFeatures(inputFeatures, .!inputFeaturesMask) let audioPaddingIds = MLXArray([config.vocabSize - 1]).expandedDimensions(axis: 0) let audioPaddingEmbs = 
embedAudio.callAsFunction(audioPaddingIds, inputsEmbeds: nil) @@ -1862,7 +1861,7 @@ public class Gemma3n: Module, VLMModel, KVCacheDimensionProvider { ) -> MLXArray { let specialModalityMask: MLXArray - if let inputIds = inputIds { + if let inputIds { specialModalityMask = expandedDimensions(inputIds .== tokenId, axis: -1) } else { // When inputIds is nil, create mask by comparing embeddings @@ -1924,10 +1923,9 @@ public class Gemma3n: Module, VLMModel, KVCacheDimensionProvider { // In class Gemma3n public func sanitize(weights: [String: MLXArray]) -> [String: MLXArray] { - print("🔍 Gemma3n.sanitize: Starting with \(weights.count) weights") var sanitizedWeights = [String: MLXArray]() - // This function's ONLY job is to remove the "model." prefix from keys. + // Remove the "model." prefix from keys. for (k, v) in weights { if k.hasPrefix("model.") { let newKey = k.split(separator: ".").dropFirst().joined(separator: ".") @@ -1937,13 +1935,11 @@ public class Gemma3n: Module, VLMModel, KVCacheDimensionProvider { } } - print("🔍 Gemma3n.sanitize: After prefix removal, have \(sanitizedWeights.count) weights") return sanitizedWeights } public static func fromPretrained(pathOrHfRepo: String) throws -> Gemma3n { let path = URL(fileURLWithPath: pathOrHfRepo) - print("🔍 Gemma3n.fromPretrained: Loading from \(pathOrHfRepo)") let configPath = path.appendingPathComponent("config.json") let configData = try Data(contentsOf: configPath) @@ -1968,30 +1964,25 @@ public class Gemma3n: Module, VLMModel, KVCacheDimensionProvider { let fileWeights = try loadArrays(url: path.appendingPathComponent(weightFile)) weights.merge(fileWeights) { _, new in new } } - print("🔍 Gemma3n.fromPretrained: Total weights loaded: \(weights.count)") - // Step 1: Main sanitization (remove "model." prefix) + // Main sanitization (remove "model." 
prefix) var sanitizedWeights = model.sanitize(weights: weights) - // Step 2: Vision model sanitization (transpose conv weights) + // Vision model sanitization (transpose conv weights) sanitizedWeights = Gemma3nVisionModel.sanitizeWeights(sanitizedWeights) - // Step 3: Audio model sanitization (transpose conv weights) - THIS WAS MISSING + // Audio model sanitization (transpose conv weights) sanitizedWeights = model.audioTower.sanitize(weights: sanitizedWeights) - // Step 4: Handle tied lm_head weights + // Handle tied lm_head weights if sanitizedWeights["language_model.lm_head.weight"] == nil { if let embedWeight = sanitizedWeights["language_model.model.embed_tokens.weight"] { - print("🔍 Tying lm_head weight.") sanitizedWeights["language_model.lm_head.weight"] = embedWeight } } - // Step 5: Load the weights - print("🔍 Attempting to load \(sanitizedWeights.count) final weights...") + // Load the weights try model.update(parameters: ModuleParameters.unflattened(sanitizedWeights), verify: [.all]) - print("✅ Model loaded successfully!") - return model } } @@ -2211,7 +2202,7 @@ private class Gemma3nCumulativeGroupNorm: Module { let expectedInputSuffix = featureDims + [numChannels] assert(Array(x.shape.suffix(expectedInputSuffix.count)) == expectedInputSuffix) - if let mask = mask { + if let mask { assert(mask.shape == Array(x.shape.prefix(2))) assert(mask.dtype == .bool) } @@ -2221,7 +2212,7 @@ private class Gemma3nCumulativeGroupNorm: Module { let xCalc = x.asType(calcDtype) let maskCalc: MLXArray - if let mask = mask { + if let mask { let maskSuffixShape = Array(repeating: 1, count: expectedInputSuffix.count) maskCalc = mask.reshaped(Array(mask.shape) + maskSuffixShape).asType(calcDtype) } else { @@ -2848,7 +2839,7 @@ private func rmsNorm2d( let vMean = mean(v, axis: 1, keepDims: true) var result = x * rsqrt(vMean + eps) - if let weight = weight { + if let weight { let weightReshaped = weight.reshaped([1, -1, 1, 1]) result = result.asType(dtype) * weightReshaped } @@ -3061,7 +3052,7 @@ private class UniversalInvertedResidual: Module, UnaryLayer { ) // Layer Scale - if let layerScaleInitValue = layerScaleInitValue { + if let layerScaleInitValue { self._layerScale.wrappedValue = LayerScale2d( dim: outChannels, initValues: layerScaleInitValue) } else { @@ -3420,7 +3411,7 @@ private class MobileAttention: Module, UnaryLayer { } // Layer scaling - if let layerScaleInitValue = layerScaleInitValue { + if let layerScaleInitValue { self._layerScale.wrappedValue = LayerScale2d( dim: outChannels, initValues: layerScaleInitValue) } else { @@ -3843,7 +3834,6 @@ private class Gemma3nVisionModel: Module { sanitizedWeights[k] = v } } else { - // THIS IS THE MISSING BLOCK // Copy all other weights (biases, norm layers, etc.) sanitizedWeights[k] = v } @@ -3955,7 +3945,7 @@ private class Gemma3nAudioModel: Module { for (k, v) in weights { if k.contains("conv.weight") { // The checkArrayShape function is not robust. - // The Python reference doesn't use it. It's safer to just transpose. + // The Python implementation doesn't use it. It's safer to just transpose. // Assuming NCHW -> NHWC for Conv2d if v.ndim == 4 { sanitizedWeights[k] = v.transposed(0, 2, 3, 1) @@ -3970,7 +3960,6 @@ private class Gemma3nAudioModel: Module { sanitizedWeights[k] = v } } else { - // THIS IS THE MISSING BLOCK sanitizedWeights[k] = v } } @@ -4175,8 +4164,8 @@ public struct Gemma3nProcessorConfiguration: Codable, Sendable { public let doPanAndScan: Bool? 
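The audio sanitizer in this hunk transposes unconditionally (a later patch in this series restores a shape check): checkpoints exported from PyTorch store convolution weights channels-first, while MLX convolutions expect channels-last. The two conversions, with illustrative shapes:

    import MLX

    // PyTorch Conv2d weight [O, C, kH, kW] -> MLX Conv2d weight [O, kH, kW, C]
    let torch2d = MLXArray.zeros([128, 1, 3, 3])
    let mlx2d = torch2d.transposed(0, 2, 3, 1)   // [128, 3, 3, 1]

    // PyTorch Conv1d weight [O, C, L] -> MLX Conv1d weight [O, L, C]
    let torch1d = MLXArray.zeros([1536, 512, 5])
    let mlx1d = torch1d.transposed(0, 2, 1)      // [1536, 5, 512]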
// Token identifiers - use default values that match Python implementation - public var imageTokenId: Int { 262145 } // From Python: image_token_id = 262145 - public var audioTokenId: Int { 262273 } // From Python: audio_token_id = 262273 + public var imageTokenId: Int { 262145 } + public var audioTokenId: Int { 262273 } public struct ImageSize: Codable, Sendable { public let height: Int From cf7f3f24d486ab3cc5928764e9d000e8021799d4 Mon Sep 17 00:00:00 2001 From: Anthony DePasquale Date: Sat, 28 Jun 2025 11:39:07 +0200 Subject: [PATCH 09/19] Remove type casting for pixel values (mlx-vlm #398) --- Libraries/MLXVLM/Models/Gemma3n.swift | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/Libraries/MLXVLM/Models/Gemma3n.swift b/Libraries/MLXVLM/Models/Gemma3n.swift index 9c6db502..0309beac 100644 --- a/Libraries/MLXVLM/Models/Gemma3n.swift +++ b/Libraries/MLXVLM/Models/Gemma3n.swift @@ -1777,10 +1777,8 @@ public class Gemma3n: Module, VLMModel, KVCacheDimensionProvider { } // Process vision features - if let pixelValues = pixelValues { - let pixelValuesTyped = pixelValues.asType(languageModel.model.embedTokens.weight.dtype) - let imageFeatures = getImageFeatures(pixelValuesTyped) - + if let pixelValues { + let imageFeatures = getImageFeatures(pixelValues) return mergeMultimodalAndText( inputIds: inputIds, inputsEmbeds: inputsEmbeds, From befbc5f3d4262486e70ecedbb62d4d629d0550f1 Mon Sep 17 00:00:00 2001 From: Anthony DePasquale Date: Sat, 28 Jun 2025 11:59:43 +0200 Subject: [PATCH 10/19] Correctly scale text embeddings for quantized models (mlx-vlm #397) --- Libraries/MLXVLM/Models/Gemma3n.swift | 48 ++++++++++----------------- 1 file changed, 18 insertions(+), 30 deletions(-) diff --git a/Libraries/MLXVLM/Models/Gemma3n.swift b/Libraries/MLXVLM/Models/Gemma3n.swift index 0309beac..6e5af9b8 100644 --- a/Libraries/MLXVLM/Models/Gemma3n.swift +++ b/Libraries/MLXVLM/Models/Gemma3n.swift @@ -1034,23 +1034,6 @@ private class Gemma3nDecoderLayer: Module { } } -private class Gemma3nTextScaledWordEmbedding: Module, UnaryLayer { - @ModuleInfo var weight: MLXArray - let embedScale: Float - - init(numEmbeddings: Int, embeddingDim: Int, embedScale: Float = 1.0) { - self.embedScale = embedScale - self._weight.wrappedValue = MLXRandom.normal([numEmbeddings, embeddingDim]) - super.init() - } - - func callAsFunction(_ x: MLXArray) -> MLXArray { - let indices = x.asType(.int32) - let embeddings = take(weight, indices, axis: 0) - return embeddings * MLXArray(embedScale, dtype: .float32).asType(weight.dtype) - } -} - private class Gemma3Model: Module { let config: TextConfig let hiddenSize: Int @@ -1059,11 +1042,12 @@ private class Gemma3Model: Module { let numHiddenLayers: Int private let _perLayerProjectionScale: MLXArray private let _perLayerInputScale: MLXArray + private let _embedTokensScale: Float + private let _embedTokensPerLayerScale: Float - @ModuleInfo(key: "embed_tokens") var embedTokens: Gemma3nTextScaledWordEmbedding - @ModuleInfo(key: "layers") var layers: [Gemma3nDecoderLayer] // This is correct! 
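As this hunk continues below, the custom Gemma3nTextScaledWordEmbedding is dropped in favor of a stock Embedding, with the sqrt(hiddenSize) scale applied at the call site. Presumably (per the commit subject) this is what lets quantization work: the quantizer knows how to swap out a standard Embedding, whereas a subclass that bakes the scale into its forward pass would be missed. A sketch of the resulting call-site pattern, with illustrative sizes:

    import Foundation
    import MLX
    import MLXNN

    let hiddenSize = 2048  // illustrative
    let embedTokens = Embedding(embeddingCount: 262_144, dimensions: hiddenSize)
    let embedTokensScale = pow(Float(hiddenSize), 0.5)

    func embed(_ inputs: MLXArray) -> MLXArray {
        let h = embedTokens(inputs)
        // Multiply in float32 and cast back, so the scale is applied at
        // full precision before returning to the embedding dtype.
        return (h * MLXArray(embedTokensScale, dtype: .float32)).asType(h.dtype)
    }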
- @ModuleInfo(key: "embed_tokens_per_layer") var embedTokensPerLayer: - Gemma3nTextScaledWordEmbedding + @ModuleInfo(key: "embed_tokens") var embedTokens: Embedding + @ModuleInfo(key: "layers") var layers: [Gemma3nDecoderLayer] + @ModuleInfo(key: "embed_tokens_per_layer") var embedTokensPerLayer: Embedding @ModuleInfo(key: "per_layer_model_projection") var perLayerModelProjection: Linear @ModuleInfo(key: "per_layer_projection_norm") var perLayerProjectionNorm: Gemma3nRMSNormWithScale @@ -1084,21 +1068,21 @@ private class Gemma3Model: Module { assert(vocabSize > 0) - self._embedTokens.wrappedValue = Gemma3nTextScaledWordEmbedding( - numEmbeddings: config.vocabSize, - embeddingDim: config.hiddenSize, - embedScale: pow(Float(config.hiddenSize), 0.5) + self._embedTokens.wrappedValue = Embedding( + embeddingCount: config.vocabSize, + dimensions: config.hiddenSize, ) + self._embedTokensScale = pow(Float(config.hiddenSize), 0.5) self._layers.wrappedValue = (0 ..< config.numHiddenLayers).map { layerIdx in Gemma3nDecoderLayer(config: config, layerIdx: layerIdx) } - self._embedTokensPerLayer.wrappedValue = Gemma3nTextScaledWordEmbedding( - numEmbeddings: config.vocabSizePerLayerInput, - embeddingDim: config.numHiddenLayers * config.hiddenSizePerLayerInput, - embedScale: pow(Float(config.hiddenSizePerLayerInput), 0.5) + self._embedTokensPerLayer.wrappedValue = Embedding( + embeddingCount: config.vocabSizePerLayerInput, + dimensions: config.numHiddenLayers * config.hiddenSizePerLayerInput, ) + self._embedTokensPerLayerScale = pow(Float(config.hiddenSizePerLayerInput), 0.5) self._perLayerModelProjection.wrappedValue = Linear( config.hiddenSize, @@ -1150,6 +1134,7 @@ private class Gemma3Model: Module { h = inputsEmbeds } else if let inputs { h = embedTokens(inputs) + h = (h * MLXArray(_embedTokensScale, dtype: .float32)).asType(h.dtype) } else { fatalError("Either inputs or inputsEmbeds must be provided") } @@ -1253,7 +1238,10 @@ private class Gemma3Model: Module { inputIds .< vocabSizePerLayerInput ) let tokens = MLX.where(perLayerInputsMask, inputIds, MLXArray.zeros(like: inputIds)) - let result = embedTokensPerLayer(tokens).reshaped( + var result = embedTokensPerLayer(tokens) + result = (result * MLXArray(_embedTokensPerLayerScale, dtype: .float32)).asType( + result.dtype) + result = result.reshaped( Array(inputIds.shape) + [config.numHiddenLayers, config.hiddenSizePerLayerInput] ) return result From 82df136c7221aec2e6913f6852ff76b05d7ccad1 Mon Sep 17 00:00:00 2001 From: Anthony DePasquale Date: Sat, 28 Jun 2025 12:24:03 +0200 Subject: [PATCH 11/19] Fix and factor out checkArrayShape --- Libraries/MLXVLM/CheckArrayShape.swift | 24 ++++++++++++++++++++ Libraries/MLXVLM/Models/Gemma3.swift | 13 ----------- Libraries/MLXVLM/Models/Gemma3n.swift | 31 +++++++++----------------- Libraries/MLXVLM/Models/Idefics3.swift | 8 +------ 4 files changed, 35 insertions(+), 41 deletions(-) create mode 100644 Libraries/MLXVLM/CheckArrayShape.swift diff --git a/Libraries/MLXVLM/CheckArrayShape.swift b/Libraries/MLXVLM/CheckArrayShape.swift new file mode 100644 index 00000000..5ced2c77 --- /dev/null +++ b/Libraries/MLXVLM/CheckArrayShape.swift @@ -0,0 +1,24 @@ +import MLX + +/// Check if array is in a supported format for conv weights +public func checkArrayShape(_ arr: MLXArray) -> Bool { + let shape = arr.shape + switch shape.count { + case 4: + let outChannels = shape[0] + let kH = shape[1] + let kW = shape[2] + // shape[3] is in_channels, which is ignored + // Check if out_channels is the largest, and kH and kW are 
the same + return (outChannels >= kH) && (outChannels >= kW) && (kH == kW) + case 3: + let kW = shape[1] + let outChannels = shape[2] + // shape[0] is ignored + // Check if kW is larger than or equal to out_channels + return kW >= outChannels + default: + // Any other number of dimensions is not supported + return false + } +} diff --git a/Libraries/MLXVLM/Models/Gemma3.swift b/Libraries/MLXVLM/Models/Gemma3.swift index b7164821..1f825d44 100644 --- a/Libraries/MLXVLM/Models/Gemma3.swift +++ b/Libraries/MLXVLM/Models/Gemma3.swift @@ -756,19 +756,6 @@ private class VisionModel: Module { visionModel(x, outputHiddenStates: outputHiddenStates) } - /// Check if array is already in MLX format for conv2d weights - private func checkArrayShape(_ arr: MLXArray) -> Bool { - let shape = arr.shape - - // Check if the shape has 4 dimensions - guard shape.count == 4 else { return false } - - let (outChannels, kH, kW, _) = (shape[0], shape[1], shape[2], shape[3]) - - // Check if out_channels is the largest, and kH and kW are the same - return (outChannels >= kH) && (outChannels >= kW) && (kH == kW) - } - func sanitize(weights: [String: MLXArray]) -> [String: MLXArray] { var sanitizedWeights = [String: MLXArray]() diff --git a/Libraries/MLXVLM/Models/Gemma3n.swift b/Libraries/MLXVLM/Models/Gemma3n.swift index 6e5af9b8..5ae0c34e 100644 --- a/Libraries/MLXVLM/Models/Gemma3n.swift +++ b/Libraries/MLXVLM/Models/Gemma3n.swift @@ -1652,20 +1652,6 @@ private func maskedScatter( return resultFlat.reshaped(inputShape) } -private func checkArrayShape(_ arr: MLXArray) -> Bool { - let shape = arr.shape - guard shape.count == 4 else { - return false - } - - let (outChannels, kH, kW, _) = (shape[0], shape[1], shape[2], shape[3]) - let result = (outChannels >= kH) && (outChannels >= kW) && (kH == kW) - print( - "🔍 checkArrayShape: shape=\(shape), outChannels=\(outChannels), kH=\(kH), kW=\(kW), result=\(result)" - ) - return result -} - // MARK: - Main Model public class Gemma3n: Module, VLMModel, KVCacheDimensionProvider { @@ -3925,27 +3911,31 @@ private class Gemma3nAudioModel: Module { return (audioencodings, currentMask) } + /// Sanitizes weights by transposing convolution layers if they are not + /// already in the expected MLX format. func sanitize(weights: [String: MLXArray]) -> [String: MLXArray] { var sanitizedWeights = [String: MLXArray]() for (k, v) in weights { if k.contains("conv.weight") { - // The checkArrayShape function is not robust. - // The Python implementation doesn't use it. It's safer to just transpose. - // Assuming NCHW -> NHWC for Conv2d - if v.ndim == 4 { + // A Conv2D weight should be 4D. + // If it is, check if it needs transposing from NCHW to NHWC. + // If checkArrayShape is true, it's already in the correct format. + if v.ndim == 4 && !checkArrayShape(v) { sanitizedWeights[k] = v.transposed(0, 2, 3, 1) } else { sanitizedWeights[k] = v } } else if k.contains("conv1d.weight") { - // Assuming NCL -> NLC for Conv1d - if v.ndim == 3 { + // A Conv1D weight should be 3D. + // If it is, check if it needs transposing from NCL to NLC. + if v.ndim == 3 && !checkArrayShape(v) { sanitizedWeights[k] = v.transposed(0, 2, 1) } else { sanitizedWeights[k] = v } } else { + // For all other weights, keep them as they are. sanitizedWeights[k] = v } } @@ -4149,7 +4139,6 @@ public struct Gemma3nProcessorConfiguration: Codable, Sendable { public let doConvertRgb: Bool? public let doPanAndScan: Bool? 
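A worked example of the heuristic just factored out (shapes are illustrative): a 4-D weight is treated as already channels-last when the two kernel dims match and out_channels dominates.

    import MLX

    let torchWeight = MLXArray.zeros([128, 1, 3, 3])     // PyTorch [O, C, kH, kW]
    let needsTranspose = !checkArrayShape(torchWeight)   // true: shape[1] != shape[2]
    let mlxWeight = torchWeight.transposed(0, 2, 3, 1)   // [128, 3, 3, 1] = [O, kH, kW, C]
    let alreadyMLX = checkArrayShape(mlxWeight)          // true: 3 == 3 and 128 >= 3

Note the heuristic cannot disambiguate a weight like [128, 3, 3, 3], where in_channels equals the kernel size and both layouts pass the check; that is presumably why the vision sanitizer instead probes one known test key and applies the decision to the whole checkpoint.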
- // Token identifiers - use default values that match Python implementation public var imageTokenId: Int { 262145 } public var audioTokenId: Int { 262273 } diff --git a/Libraries/MLXVLM/Models/Idefics3.swift b/Libraries/MLXVLM/Models/Idefics3.swift index 17b1c5f4..a599ae2f 100644 --- a/Libraries/MLXVLM/Models/Idefics3.swift +++ b/Libraries/MLXVLM/Models/Idefics3.swift @@ -386,12 +386,6 @@ private enum Language { // MARK: - Vision private enum Vision { - static func checkArrayShape(_ arr: MLXArray) -> Bool { - if arr.ndim != 4 { return false } - let (o, h, w, _) = (arr.dim(0), arr.dim(1), arr.dim(2), arr.dim(3)) - return (o >= h && o >= w && h == w) - } - fileprivate class Attention: Module { let numHeads: Int let scale: Float @@ -602,7 +596,7 @@ private enum Vision { if k.contains("position_ids") { continue } else if k.contains("patch_embedding.weight") { - if Vision.checkArrayShape(v) { + if checkArrayShape(v) { sanitizedWeights[k] = v } else { sanitizedWeights[k] = v.transposed(0, 2, 3, 1) From 79df57cfa8fa6d5bcee403f85c6cfc7ce788c6b3 Mon Sep 17 00:00:00 2001 From: Anthony DePasquale Date: Sat, 28 Jun 2025 15:09:46 +0200 Subject: [PATCH 12/19] Use improved-parameter-errors branch of mlx-swift --- mlx-swift-examples.xcodeproj/project.pbxproj | 6 +++--- .../xcshareddata/swiftpm/Package.resolved | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/mlx-swift-examples.xcodeproj/project.pbxproj b/mlx-swift-examples.xcodeproj/project.pbxproj index df8f9ccd..290909c9 100644 --- a/mlx-swift-examples.xcodeproj/project.pbxproj +++ b/mlx-swift-examples.xcodeproj/project.pbxproj @@ -283,9 +283,9 @@ isa = PBXFileSystemSynchronizedBuildFileExceptionSet; membershipExceptions = ( MLXLMTests/BaseConfigurationTests.swift, - MLXLMTests/ToolTests.swift, MLXLMTests/EvalTests.swift, MLXLMTests/StreamlinedTests.swift, + MLXLMTests/ToolTests.swift, MLXLMTests/UserInputTests.swift, ); target = C3208E6D2DB19451006AE6CA /* MLXLMTests */; @@ -3249,8 +3249,8 @@ isa = XCRemoteSwiftPackageReference; repositoryURL = "https://github.com/ml-explore/mlx-swift"; requirement = { - kind = upToNextMajorVersion; - minimumVersion = 0.25.4; + branch = "improved-parameter-errors"; + kind = branch; }; }; C32B4C6B2DA7132C00EF663D /* XCRemoteSwiftPackageReference "swift-async-algorithms" */ = { diff --git a/mlx-swift-examples.xcodeproj/project.xcworkspace/xcshareddata/swiftpm/Package.resolved b/mlx-swift-examples.xcodeproj/project.xcworkspace/xcshareddata/swiftpm/Package.resolved index ee2466cb..59ed2b8a 100644 --- a/mlx-swift-examples.xcodeproj/project.xcworkspace/xcshareddata/swiftpm/Package.resolved +++ b/mlx-swift-examples.xcodeproj/project.xcworkspace/xcshareddata/swiftpm/Package.resolved @@ -1,5 +1,5 @@ { - "originHash" : "6c3d58787193c406294dfd4fa330ba611ece7c4a64b00302aa63c9ccafd8f43f", + "originHash" : "5b8f479687d916677158d7747e9b766ff83f08092c1595079ff9b70c909c6250", "pins" : [ { "identity" : "gzipswift", @@ -24,8 +24,8 @@ "kind" : "remoteSourceControl", "location" : "https://github.com/ml-explore/mlx-swift", "state" : { - "revision" : "b94473af8c50010edba87a48bbd60c3d7f949852", - "version" : "0.25.4" + "branch" : "improved-parameter-errors", + "revision" : "1c6ce2485f879b53e64a5e599d5a9769b8036786" } }, { From 8820938782999c6dad18ae09831db39f0fe5b62b Mon Sep 17 00:00:00 2001 From: Anthony DePasquale Date: Sat, 28 Jun 2025 12:35:57 +0200 Subject: [PATCH 13/19] Fix sanitization, computed layers, module keys --- Libraries/MLXVLM/Models/Gemma3n.swift | 157 +++++++++++--------------- 1 file changed, 64 
insertions(+), 93 deletions(-) diff --git a/Libraries/MLXVLM/Models/Gemma3n.swift b/Libraries/MLXVLM/Models/Gemma3n.swift index 5ae0c34e..892b5d8a 100644 --- a/Libraries/MLXVLM/Models/Gemma3n.swift +++ b/Libraries/MLXVLM/Models/Gemma3n.swift @@ -1340,33 +1340,27 @@ private class LanguageModel: Module, KVCacheDimensionProvider { } func sanitize(weights: [String: MLXArray]) -> [String: MLXArray] { - var sanitizedWeights = [String: MLXArray]() - + var sanitizedWeights = weights for (k, v) in weights { - // Skip rotary embedding inverse frequency weights (matches Python exactly) - if k.contains("self_attn.rotary_emb.inv_freq") { - continue - } - // Python logic: if "language_model.model" not in k and "language_model.lm_head" not in k: - else if !k.contains("language_model.model") && !k.contains("language_model.lm_head") { + if !k.contains("language_model.model") && !k.contains("language_model.lm_head") { + // Transform keys that don't contain the specific patterns let newKey = k.replacingOccurrences( of: "language_model", with: "language_model.model") sanitizedWeights[newKey] = v - } - // Otherwise, keep the key as is - else { + } else if k.contains("self_attn.rotary_emb.inv_freq") { + // Skip rotary embedding inverse frequency weights + continue + } else { sanitizedWeights[k] = v } } - - // If lm_head weight is missing, use embed_tokens weight as fallback (matches Python exactly) + // Handle tied lm_head weights if sanitizedWeights["language_model.lm_head.weight"] == nil { let embedTokensKey = "language_model.model.embed_tokens.weight" if let embedWeight = sanitizedWeights[embedTokensKey] { sanitizedWeights["language_model.lm_head.weight"] = embedWeight } } - return sanitizedWeights } } @@ -1676,7 +1670,6 @@ public class Gemma3n: Module, VLMModel, KVCacheDimensionProvider { self._languageModel.wrappedValue = LanguageModel(config: config.textConfig) self._visionTower.wrappedValue = Gemma3nVisionModel(config: config.visionConfig) self._audioTower.wrappedValue = Gemma3nAudioModel(config: config.audioConfig) - self._embedVision.wrappedValue = Gemma3nMultimodalEmbedder( multimodalConfig: config.visionConfig, textConfig: config.textConfig @@ -1893,20 +1886,16 @@ public class Gemma3n: Module, VLMModel, KVCacheDimensionProvider { return languageModel(inputs: inputs, cache: convertedCache).logits } - // In class Gemma3n public func sanitize(weights: [String: MLXArray]) -> [String: MLXArray] { var sanitizedWeights = [String: MLXArray]() - - // Remove the "model." prefix from keys. for (k, v) in weights { - if k.hasPrefix("model.") { + if k.starts(with: "model.") { let newKey = k.split(separator: ".").dropFirst().joined(separator: ".") sanitizedWeights[newKey] = v } else { sanitizedWeights[k] = v } } - return sanitizedWeights } @@ -1937,14 +1926,11 @@ public class Gemma3n: Module, VLMModel, KVCacheDimensionProvider { weights.merge(fileWeights) { _, new in new } } - // Main sanitization (remove "model." 
prefix) var sanitizedWeights = model.sanitize(weights: weights) - - // Vision model sanitization (transpose conv weights) - sanitizedWeights = Gemma3nVisionModel.sanitizeWeights(sanitizedWeights) - - // Audio model sanitization (transpose conv weights) - sanitizedWeights = model.audioTower.sanitize(weights: sanitizedWeights) + sanitizedWeights = model.visionTower.sanitize(weights: sanitizedWeights) + // The audio and language sanitization is not done in the Python implementation + // sanitizedWeights = model.audioTower.sanitize(weights: sanitizedWeights) + // sanitizedWeights = model.languageModel.sanitize(weights: sanitizedWeights) // Handle tied lm_head weights if sanitizedWeights["language_model.lm_head.weight"] == nil { @@ -1992,7 +1978,7 @@ private class Gemma3nAudioRelativePositionEmbedding: Module { let maxForward: Int @ModuleInfo(key: "pos_proj") var posProj: Linear - @ModuleInfo(key: "inv_timescales") var invTimescales: MLXArray + private let _invTimescales: MLXArray init(config: AudioConfig) { self.config = config @@ -2016,7 +2002,7 @@ private class Gemma3nAudioRelativePositionEmbedding: Module { MLXArray(0 ..< numTimescales).asType(.float32) * (-logTimescaleIncrement) ) - self._invTimescales.wrappedValue = expandedDimensions( + self._invTimescales = expandedDimensions( expandedDimensions(invTimescales, axis: 0), axis: 0 ) @@ -2028,7 +2014,7 @@ private class Gemma3nAudioRelativePositionEmbedding: Module { assert(position.ndim == 2) let positionFloat = expandedDimensions(position.asType(.float32), axis: -1) - let scaledTime = positionFloat * invTimescales + let scaledTime = positionFloat * _invTimescales let timingSignal = concatenated([sin(scaledTime), cos(scaledTime)], axis: -1) return timingSignal.asType(dtype) } @@ -2328,6 +2314,7 @@ private class Gemma3nAudioSubSampleConvProjection: Module { let fInPadded = currentFForBlockInput + padFLeft + padFRight let fOutAfterConv = (fInPadded - kernelW) / strideW + 1 + calculatedFOutDims.append(fOutAfterConv) currentFForBlockInput = fOutAfterConv } @@ -2389,8 +2376,8 @@ private class Gemma3nAudioAttention: Module { let attentionLogitsSoftCap: Float let contextSize: Int let qScale: Float - let localCausalValidMask: MLXArray - let softcap: MLXArray + private let _localCausalValidMask: MLXArray + private let _softcap: MLXArray @ModuleInfo(key: "relative_position_embedding") var relativePositionEmbedding: Gemma3nAudioRelativePositionEmbedding @@ -2434,9 +2421,10 @@ private class Gemma3nAudioAttention: Module { ) let localCausalValidMaskTemp = MLXArray.ones([chunkSize, contextSize], dtype: .bool) - self.localCausalValidMask = localCausalValidMaskTemp .&& lowerCausalMask .&& upperCausalMask + self._localCausalValidMask = localCausalValidMaskTemp .&& lowerCausalMask + .&& upperCausalMask - self.softcap = MLXArray(attentionLogitsSoftCap, dtype: .float32) + self._softcap = MLXArray(attentionLogitsSoftCap, dtype: .float32) super.init() } @@ -2536,7 +2524,7 @@ private class Gemma3nAudioAttention: Module { let conditionFromCausality = expandedDimensions( expandedDimensions( - expandedDimensions(localCausalValidMask, axis: 0), + expandedDimensions(_localCausalValidMask, axis: 0), axis: 0 ), axis: 0 @@ -2547,9 +2535,9 @@ private class Gemma3nAudioAttention: Module { var logits = relativePositionEmbedding(queryBlocks, keyBlocks) // Apply attention logit softcap - logits = logits / softcap + logits = logits / _softcap logits = tanh(logits) - logits = logits * softcap + logits = logits * _softcap // Apply the combined mask logits = MLX.where( @@ 
-2591,10 +2579,10 @@ private class Gemma3nAudioConformerAttention: Module { let postInFeatures: Int private let _gradientClipping: MLXArray - @ModuleInfo var preAttnNorm: Gemma3nRMSNormWithScale + @ModuleInfo(key: "pre_attn_norm") var preAttnNorm: Gemma3nRMSNormWithScale @ModuleInfo var attn: Gemma3nAudioAttention @ModuleInfo var post: Linear - @ModuleInfo var postNorm: Gemma3nRMSNormWithScale + @ModuleInfo(key: "post_norm") var postNorm: Gemma3nRMSNormWithScale init(config: AudioConfig) { self.config = config @@ -2737,17 +2725,17 @@ private class Gemma3nAudioConformerLightConv1d: Module { // MARK: - Conformer Block private class Gemma3nAudioConformerBlock: Module { let config: AudioConfig - private let gradientClipping: MLXArray + private let _gradientClipping: MLXArray - @ModuleInfo var ffwLayerStart: Gemma3nAudioConformerFeedForward + @ModuleInfo(key: "ffw_layer_start") var ffwLayerStart: Gemma3nAudioConformerFeedForward @ModuleInfo var attention: Gemma3nAudioConformerAttention @ModuleInfo var lconv1d: Gemma3nAudioConformerLightConv1d - @ModuleInfo var ffwLayerEnd: Gemma3nAudioConformerFeedForward + @ModuleInfo(key: "ffw_layer_end") var ffwLayerEnd: Gemma3nAudioConformerFeedForward @ModuleInfo var norm: Gemma3nRMSNormWithScale init(config: AudioConfig) { self.config = config - self.gradientClipping = MLXArray(config.gradientClipping) + self._gradientClipping = MLXArray(config.gradientClipping) self._ffwLayerStart.wrappedValue = Gemma3nAudioConformerFeedForward(config: config) self._attention.wrappedValue = Gemma3nAudioConformerAttention(config: config) @@ -2771,7 +2759,7 @@ private class Gemma3nAudioConformerBlock: Module { result = lconv1d(audioencodingsForLconvInput) result = ffwLayerEnd(result) - result = clip(result, min: -gradientClipping, max: gradientClipping) + result = clip(result, min: -_gradientClipping, max: _gradientClipping) return norm(result) } } @@ -2856,7 +2844,8 @@ private func numGroups(groupSize: Int?, channels: Int) -> Int { } // NOTE: groupSize == 1 -> depthwise conv assert(channels % groupSize == 0) - return channels / groupSize + let groups = channels / groupSize + return groups } private func makeDivisible( @@ -3082,6 +3071,7 @@ private class EdgeResidual: Module, UnaryLayer { self.hasSkip = (inChannels == outChannels && stride == 1) && !noskip let padding = (expKernelSize - 1) / 2 + self._convExp.wrappedValue = Conv2d( inputChannels: inChannels, outputChannels: midChannels, @@ -3139,17 +3129,17 @@ private class MultiQueryAttention2d: Module { let valueDim: Int let scale: Float - @ModuleInfo var queryProj: Conv2d + @ModuleInfo(key: "query_proj") var queryProj: Conv2d - @ModuleInfo var keyDownConv: UnaryLayer - @ModuleInfo var keyNorm: UnaryLayer - @ModuleInfo var valueDownConv: UnaryLayer - @ModuleInfo var valueNorm: UnaryLayer + @ModuleInfo(key: "key_down_conv") var keyDownConv: UnaryLayer + @ModuleInfo(key: "key_norm") var keyNorm: UnaryLayer + @ModuleInfo(key: "value_down_conv") var valueDownConv: UnaryLayer + @ModuleInfo(key: "value_norm") var valueNorm: UnaryLayer - @ModuleInfo var keyProj: Conv2d - @ModuleInfo var valueProj: Conv2d + @ModuleInfo(key: "key_proj") var keyProj: Conv2d + @ModuleInfo(key: "value_proj") var valueProj: Conv2d @ModuleInfo(key: "attn_drop") var attnDrop: UnaryLayer - @ModuleInfo var outputProj: Conv2d + @ModuleInfo(key: "output_proj") var outputProj: Conv2d @ModuleInfo(key: "proj_drop") var projDrop: UnaryLayer init( @@ -3195,6 +3185,7 @@ private class MultiQueryAttention2d: Module { groups: dim, // Depthwise bias: false ) + 
self._keyNorm.wrappedValue = RMSNormAct2d(numChannels: dim, eps: 1e-6, applyAct: false) } else { self._keyDownConv.wrappedValue = Identity() @@ -3323,8 +3314,8 @@ private class MobileAttention: Module, UnaryLayer { @ModuleInfo var norm: RMSNormAct2d @ModuleInfo var attn: MultiQueryAttention2d - @ModuleInfo var layerScale: UnaryLayer - @ModuleInfo var dropPath: Identity + @ModuleInfo(key: "layer_scale") var layerScale: UnaryLayer + @ModuleInfo(key: "drop_path") var dropPath: Identity init( inChannels: Int, @@ -3544,7 +3535,7 @@ private class MobileNetV5MultiScaleFusionAdapter: Module { @ModuleInfo var ffn: UniversalInvertedResidual @ModuleInfo var norm: RMSNormAct2d - @ModuleInfo var avgPool: AvgPool2d + @ModuleInfo(key: "avg_pool") var avgPool: AvgPool2d init( inChannels: [Int], @@ -3780,37 +3771,23 @@ private class Gemma3nVisionModel: Module { } func sanitize(weights: [String: MLXArray]) -> [String: MLXArray] { - return Self.sanitizeWeights(weights) - } - - static func sanitizeWeights(_ weights: [String: MLXArray]) -> [String: MLXArray] { - var sanitizedWeights = [String: MLXArray]() + var sanitizedWeights = weights var skipTranspose = false - - // This logic is correct let testKey = "vision_tower.timm_model.blocks.0.0.conv_exp.weight" - if let convWeight = weights[testKey] { - let shape = convWeight.shape - if shape.count == 4, shape[3] > shape[1] { - skipTranspose = true - } + if let convWeight = weights[testKey], convWeight.ndim == 4, + convWeight.shape[3] > convWeight.shape[1] + { + skipTranspose = true } - for (k, v) in weights { if (k.contains("conv") && k.contains("weight")) || (k.contains("attn") && k.contains("proj.weight")) { - if v.shape.count == 4 && !skipTranspose { + if v.ndim == 4 && !skipTranspose { sanitizedWeights[k] = v.transposed(0, 2, 3, 1) - } else { - sanitizedWeights[k] = v } - } else { - // Copy all other weights (biases, norm layers, etc.) - sanitizedWeights[k] = v } } - return sanitizedWeights } } @@ -3828,8 +3805,9 @@ private class Gemma3nAudioModel: Module { self._subsampleConvProjection.wrappedValue = Gemma3nAudioSubSampleConvProjection( config: config) - self._conformer.wrappedValue = (0 ..< config.confNumHiddenLayers).map { _ in - Gemma3nAudioConformerBlock(config: config) + + self._conformer.wrappedValue = (0 ..< config.confNumHiddenLayers).map { i in + return Gemma3nAudioConformerBlock(config: config) } super.init() @@ -3914,32 +3892,25 @@ private class Gemma3nAudioModel: Module { /// Sanitizes weights by transposing convolution layers if they are not /// already in the expected MLX format. func sanitize(weights: [String: MLXArray]) -> [String: MLXArray] { - var sanitizedWeights = [String: MLXArray]() - + var sanitizedWeights = weights + // Iterate over the original keys to decide which ones to modify in the copy. for (k, v) in weights { if k.contains("conv.weight") { - // A Conv2D weight should be 4D. - // If it is, check if it needs transposing from NCHW to NHWC. - // If checkArrayShape is true, it's already in the correct format. - if v.ndim == 4 && !checkArrayShape(v) { - sanitizedWeights[k] = v.transposed(0, 2, 3, 1) - } else { + if checkArrayShape(v) { sanitizedWeights[k] = v + } else { + sanitizedWeights[k] = v.transposed(0, 2, 3, 1) } } else if k.contains("conv1d.weight") { - // A Conv1D weight should be 3D. - // If it is, check if it needs transposing from NCL to NLC. 
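// The Conv1d case is the same idea one dimension down: PyTorch [out, in, L] versus
// MLX [out, L, in]. A minimal sketch using the depthwise shape that shows up later
// in the audio tower ([1536, 1, 5] -> [1536, 5, 1]); shapes are illustrative only.
import MLX

let torchConv1d = MLXArray.zeros([1536, 1, 5])    // PyTorch [out, in, L], depthwise
let mlxConv1d = torchConv1d.transposed(0, 2, 1)   // MLX [out, L, in] = [1536, 5, 1]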
- if v.ndim == 3 && !checkArrayShape(v) { - sanitizedWeights[k] = v.transposed(0, 2, 1) - } else { + if true { sanitizedWeights[k] = v + } else { + sanitizedWeights[k] = v.transposed(0, 2, 1) } } else { - // For all other weights, keep them as they are. sanitizedWeights[k] = v } } - return sanitizedWeights } } From 02767d5b5971db5abe69be39b76d560d2dd8a6f5 Mon Sep 17 00:00:00 2001 From: Anthony DePasquale Date: Sat, 28 Jun 2025 16:51:23 +0200 Subject: [PATCH 14/19] Do all sanitization steps on load --- Libraries/MLXVLM/Models/Gemma3n.swift | 49 +++------------------------ 1 file changed, 4 insertions(+), 45 deletions(-) diff --git a/Libraries/MLXVLM/Models/Gemma3n.swift b/Libraries/MLXVLM/Models/Gemma3n.swift index 892b5d8a..d0b2dcfe 100644 --- a/Libraries/MLXVLM/Models/Gemma3n.swift +++ b/Libraries/MLXVLM/Models/Gemma3n.swift @@ -1896,53 +1896,12 @@ public class Gemma3n: Module, VLMModel, KVCacheDimensionProvider { sanitizedWeights[k] = v } } + sanitizedWeights = visionTower.sanitize(weights: sanitizedWeights) + // TODO: The audio and language sanitization is not done in the Python implementation. Is this needed? + sanitizedWeights = audioTower.sanitize(weights: sanitizedWeights) + sanitizedWeights = languageModel.sanitize(weights: sanitizedWeights) return sanitizedWeights } - - public static func fromPretrained(pathOrHfRepo: String) throws -> Gemma3n { - let path = URL(fileURLWithPath: pathOrHfRepo) - - let configPath = path.appendingPathComponent("config.json") - let configData = try Data(contentsOf: configPath) - - let decoder = JSONDecoder() - decoder.keyDecodingStrategy = .convertFromSnakeCase - let modelConfig = try decoder.decode(ModelConfig.self, from: configData) - - let model = Gemma3n(modelConfig) - - // Load all weight files into a single dictionary - let weightFiles = try FileManager.default.contentsOfDirectory(atPath: path.path) - .filter { $0.hasSuffix(".safetensors") } - guard !weightFiles.isEmpty else { - throw NSError( - domain: "ModelLoading", code: 1, - userInfo: [NSLocalizedDescriptionKey: "No safetensors found in \(path.path)"]) - } - - var weights = [String: MLXArray]() - for weightFile in weightFiles { - let fileWeights = try loadArrays(url: path.appendingPathComponent(weightFile)) - weights.merge(fileWeights) { _, new in new } - } - - var sanitizedWeights = model.sanitize(weights: weights) - sanitizedWeights = model.visionTower.sanitize(weights: sanitizedWeights) - // The audio and language sanitization is not done in the Python implementation - // sanitizedWeights = model.audioTower.sanitize(weights: sanitizedWeights) - // sanitizedWeights = model.languageModel.sanitize(weights: sanitizedWeights) - - // Handle tied lm_head weights - if sanitizedWeights["language_model.lm_head.weight"] == nil { - if let embedWeight = sanitizedWeights["language_model.model.embed_tokens.weight"] { - sanitizedWeights["language_model.lm_head.weight"] = embedWeight - } - } - - // Load the weights - try model.update(parameters: ModuleParameters.unflattened(sanitizedWeights), verify: [.all]) - return model - } } // MARK: - Audio Model Components From e4018c7d7e753bdea5af02129c804b88f4f4d154 Mon Sep 17 00:00:00 2001 From: Anthony DePasquale Date: Sat, 28 Jun 2025 20:28:54 +0200 Subject: [PATCH 15/19] Clean up Gemma3nRMSNorm --- Libraries/MLXVLM/Models/Gemma3n.swift | 175 +++++++++++--------------- 1 file changed, 70 insertions(+), 105 deletions(-) diff --git a/Libraries/MLXVLM/Models/Gemma3n.swift b/Libraries/MLXVLM/Models/Gemma3n.swift index d0b2dcfe..4c266d6d 100644 --- 
a/Libraries/MLXVLM/Models/Gemma3n.swift +++ b/Libraries/MLXVLM/Models/Gemma3n.swift @@ -386,46 +386,32 @@ public struct ModelConfig: Codable, Sendable { // MARK: - Language Model Components -// Base protocol for RMSNorm variants -private protocol Gemma3nRMSNormProtocol: UnaryLayer { - func callAsFunction(_ x: MLXArray) -> MLXArray -} - -// RMSNorm with scale parameter -private class Gemma3nRMSNormWithScale: Module, Gemma3nRMSNormProtocol { +private class Gemma3nRMSNorm: Module { let eps: Float let scaleShift: Float - @ModuleInfo var weight: MLXArray + @ModuleInfo var weight: MLXArray? - init(dim: Int, eps: Float = 1e-6, scaleShift: Float = 0.0) { + init(dim: Int, eps: Float = 1e-6, scaleShift: Float = 0, withScale: Bool = true) { self.eps = eps self.scaleShift = scaleShift - self._weight.wrappedValue = MLXArray.ones([dim]) - super.init() - } - - func callAsFunction(_ x: MLXArray) -> MLXArray { - let output = norm(x.asType(.float32)) - return (output * (weight + scaleShift)).asType(x.dtype) - } - - private func norm(_ x: MLXArray) -> MLXArray { - return x * rsqrt(x.square().mean(axis: -1, keepDims: true) + eps) - } -} -// RMSNorm without scale parameter (no weight to load from checkpoint) -private class Gemma3nRMSNormNoScale: Module, Gemma3nRMSNormProtocol { - let eps: Float + if withScale { + self.weight = MLXArray.ones([dim]) + } else { + self.weight = nil + } - init(dim: Int, eps: Float = 1e-6) { - self.eps = eps super.init() } func callAsFunction(_ x: MLXArray) -> MLXArray { let output = norm(x.asType(.float32)) - return output.asType(x.dtype) + + if let weight { + return (output * (weight + scaleShift)).asType(x.dtype) + } else { + return output.asType(x.dtype) + } } private func norm(_ x: MLXArray) -> MLXArray { @@ -433,32 +419,17 @@ private class Gemma3nRMSNormNoScale: Module, Gemma3nRMSNormProtocol { } } -// Factory function to create the appropriate RMSNorm variant -private func createGemma3nRMSNorm( - dim: Int, - eps: Float = 1e-6, - scaleShift: Float = 0.0, - withScale: Bool = true -) -> any Gemma3nRMSNormProtocol { - if withScale { - return Gemma3nRMSNormWithScale(dim: dim, eps: eps, scaleShift: scaleShift) - } else { - return Gemma3nRMSNormNoScale(dim: dim, eps: eps) - } -} - private class Gemma3nLaurelBlock: Module { @ModuleInfo(key: "linear_left") var linearLeft: Linear @ModuleInfo(key: "linear_right") var linearRight: Linear - @ModuleInfo(key: "post_laurel_norm") var postLaurelNorm: Gemma3nRMSNormWithScale + @ModuleInfo(key: "post_laurel_norm") var postLaurelNorm: Gemma3nRMSNorm init(config: TextConfig) { self._linearLeft.wrappedValue = Linear(config.hiddenSize, config.laurelRank, bias: false) self._linearRight.wrappedValue = Linear(config.laurelRank, config.hiddenSize, bias: false) - self._postLaurelNorm.wrappedValue = Gemma3nRMSNormWithScale( + self._postLaurelNorm.wrappedValue = Gemma3nRMSNorm( dim: config.hiddenSize, eps: config.rmsNormEps, - scaleShift: 0.0 ) super.init() } @@ -495,8 +466,8 @@ private class Gemma3nRotaryEmbedding: Module { let originalMaxSeqLen: Int let config: TextConfig let attentionScaling: Float - let _invFreq: MLXArray - let _originalInvFreq: MLXArray + private let _invFreq: MLXArray + private let _originalInvFreq: MLXArray init(config: TextConfig) { if let ropeScaling = config.ropeScaling { @@ -570,9 +541,9 @@ private class Gemma3nAttention: Module { @ModuleInfo(key: "k_proj") var kProj: Linear @ModuleInfo(key: "v_proj") var vProj: Linear @ModuleInfo(key: "o_proj") var oProj: Linear - @ModuleInfo(key: "q_norm") var qNorm: Gemma3nRMSNormWithScale - 
@ModuleInfo(key: "k_norm") var kNorm: Gemma3nRMSNormWithScale - @ModuleInfo(key: "v_norm") var vNorm: Gemma3nRMSNormNoScale + @ModuleInfo(key: "q_norm") var qNorm: Gemma3nRMSNorm + @ModuleInfo(key: "k_norm") var kNorm: Gemma3nRMSNorm + @ModuleInfo(key: "v_norm") var vNorm: Gemma3nRMSNorm init(config: TextConfig, layerIdx: Int) { self.isSliding = @@ -594,13 +565,14 @@ private class Gemma3nAttention: Module { self._vProj.wrappedValue = Linear(dim, numKVHeads * headDim, bias: false) self._oProj.wrappedValue = Linear(numHeads * headDim, dim, bias: false) - self._qNorm.wrappedValue = Gemma3nRMSNormWithScale( + self._qNorm.wrappedValue = Gemma3nRMSNorm( dim: config.headDim, eps: config.rmsNormEps) - self._kNorm.wrappedValue = Gemma3nRMSNormWithScale( + self._kNorm.wrappedValue = Gemma3nRMSNorm( dim: config.headDim, eps: config.rmsNormEps) - self._vNorm.wrappedValue = Gemma3nRMSNormNoScale( + self._vNorm.wrappedValue = Gemma3nRMSNorm( dim: config.headDim, - eps: config.rmsNormEps + eps: config.rmsNormEps, + withScale: false ) let firstKvSharedLayerIdx = config.numHiddenLayers - config.numKvSharedLayers @@ -749,8 +721,8 @@ private class Gemma3nAltUp: Module { @ModuleInfo(key: "correction_coefs") var correctionCoefs: Linear @ModuleInfo(key: "prediction_coefs") var predictionCoefs: Linear @ModuleInfo(key: "modality_router") var modalityRouter: Linear - @ModuleInfo(key: "router_norm") var routerNorm: Gemma3nRMSNormWithScale - let _routerInputScale: MLXArray + @ModuleInfo(key: "router_norm") var routerNorm: Gemma3nRMSNorm + private let _routerInputScale: MLXArray let config: TextConfig @@ -773,10 +745,9 @@ private class Gemma3nAltUp: Module { config.altupNumInputs, bias: false ) - self._routerNorm.wrappedValue = Gemma3nRMSNormWithScale( + self._routerNorm.wrappedValue = Gemma3nRMSNorm( dim: config.hiddenSize, eps: config.rmsNormEps, - scaleShift: 0.0 ) self._routerInputScale = MLXArray(pow(Float(config.hiddenSize), -1.0)) @@ -784,8 +755,11 @@ private class Gemma3nAltUp: Module { } func computeRouterModalities(_ x: MLXArray) -> MLXArray { - let routerInputs = - routerNorm(x) * _routerInputScale.asType(routerNorm.weight.dtype) + guard let routerNormWeight = routerNorm.weight else { + fatalError("routerNorm.weight is nil in Gemma3nAltUp") + } + let routerInputs = routerNorm(x) * _routerInputScale.asType(routerNormWeight.dtype) + let routed = modalityRouter(routerInputs).asType(.float32) return tanh(routed) } @@ -875,17 +849,15 @@ private class Gemma3nDecoderLayer: Module { @ModuleInfo(key: "self_attn") var selfAttn: Gemma3nAttention @ModuleInfo var mlp: MLP - @ModuleInfo(key: "input_layernorm") var inputLayernorm: Gemma3nRMSNormWithScale - @ModuleInfo(key: "post_attention_layernorm") var postAttentionLayernorm: Gemma3nRMSNormWithScale - @ModuleInfo(key: "pre_feedforward_layernorm") var preFeedforwardLayernorm: - Gemma3nRMSNormWithScale - @ModuleInfo(key: "post_feedforward_layernorm") var postFeedforwardLayernorm: - Gemma3nRMSNormWithScale + @ModuleInfo(key: "input_layernorm") var inputLayernorm: Gemma3nRMSNorm + @ModuleInfo(key: "post_attention_layernorm") var postAttentionLayernorm: Gemma3nRMSNorm + @ModuleInfo(key: "pre_feedforward_layernorm") var preFeedforwardLayernorm: Gemma3nRMSNorm + @ModuleInfo(key: "post_feedforward_layernorm") var postFeedforwardLayernorm: Gemma3nRMSNorm @ModuleInfo var altup: Gemma3nAltUp @ModuleInfo var laurel: Gemma3nLaurelBlock @ModuleInfo(key: "per_layer_input_gate") var perLayerInputGate: Linear @ModuleInfo(key: "per_layer_projection") var perLayerProjection: Linear - 
@ModuleInfo(key: "post_per_layer_input_norm") var postPerLayerInputNorm: Gemma3nRMSNormWithScale + @ModuleInfo(key: "post_per_layer_input_norm") var postPerLayerInputNorm: Gemma3nRMSNorm init(config: TextConfig, layerIdx: Int) { self.config = config @@ -901,26 +873,22 @@ private class Gemma3nDecoderLayer: Module { == "sliding_attention" self._mlp.wrappedValue = MLP(config: config, layerIdx: layerIdx) - self._inputLayernorm.wrappedValue = Gemma3nRMSNormWithScale( + self._inputLayernorm.wrappedValue = Gemma3nRMSNorm( dim: hiddenSize, eps: config.rmsNormEps, - scaleShift: 0.0 ) - self._postAttentionLayernorm.wrappedValue = Gemma3nRMSNormWithScale( + self._postAttentionLayernorm.wrappedValue = Gemma3nRMSNorm( dim: hiddenSize, eps: config.rmsNormEps, - scaleShift: 0.0 ) - self._preFeedforwardLayernorm.wrappedValue = Gemma3nRMSNormWithScale( + self._preFeedforwardLayernorm.wrappedValue = Gemma3nRMSNorm( dim: hiddenSize, eps: config.rmsNormEps, - scaleShift: 0.0 ) - self._postFeedforwardLayernorm.wrappedValue = Gemma3nRMSNormWithScale( + self._postFeedforwardLayernorm.wrappedValue = Gemma3nRMSNorm( dim: hiddenSize, eps: config.rmsNormEps, - scaleShift: 0.0 ) self._altup.wrappedValue = Gemma3nAltUp(config: config) @@ -936,10 +904,9 @@ private class Gemma3nDecoderLayer: Module { hiddenSize, bias: false ) - self._postPerLayerInputNorm.wrappedValue = Gemma3nRMSNormWithScale( + self._postPerLayerInputNorm.wrappedValue = Gemma3nRMSNorm( dim: hiddenSize, eps: config.rmsNormEps, - scaleShift: 0.0 ) super.init() @@ -1049,13 +1016,12 @@ private class Gemma3Model: Module { @ModuleInfo(key: "layers") var layers: [Gemma3nDecoderLayer] @ModuleInfo(key: "embed_tokens_per_layer") var embedTokensPerLayer: Embedding @ModuleInfo(key: "per_layer_model_projection") var perLayerModelProjection: Linear - @ModuleInfo(key: "per_layer_projection_norm") var perLayerProjectionNorm: - Gemma3nRMSNormWithScale + @ModuleInfo(key: "per_layer_projection_norm") var perLayerProjectionNorm: Gemma3nRMSNorm @ModuleInfo(key: "altup_projections") var altupProjections: [Linear] @ModuleInfo(key: "altup_unembed_projections") var altupUnembedProjections: [Linear] - @ModuleInfo var norm: Gemma3nRMSNormWithScale + @ModuleInfo var norm: Gemma3nRMSNorm @ModuleInfo(key: "rope_embedding") var ropeEmbedding: Gemma3nRotaryEmbedding @ModuleInfo(key: "rope_embedding_local") var ropeEmbeddingLocal: Gemma3nRotaryEmbedding @@ -1090,10 +1056,9 @@ private class Gemma3Model: Module { bias: false ) - self._perLayerProjectionNorm.wrappedValue = Gemma3nRMSNormWithScale( + self._perLayerProjectionNorm.wrappedValue = Gemma3nRMSNorm( dim: config.hiddenSizePerLayerInput, eps: config.rmsNormEps, - scaleShift: 0.0 ) self._altupProjections.wrappedValue = (0 ..< (config.altupNumInputs - 1)).map { _ in @@ -1103,10 +1068,9 @@ private class Gemma3Model: Module { Linear(config.hiddenSize, config.hiddenSize, bias: false) } - self._norm.wrappedValue = Gemma3nRMSNormWithScale( + self._norm.wrappedValue = Gemma3nRMSNorm( dim: config.hiddenSize, eps: config.rmsNormEps, - scaleShift: 0.0 ) self._perLayerProjectionScale = MLXArray(pow(Float(hiddenSize), -0.5)) @@ -1375,11 +1339,11 @@ private class Gemma3nMultimodalEmbedder: Module, UnaryLayer { let textHiddenSize: Int @ModuleInfo var embedding: Embedding - @ModuleInfo(key: "hard_embedding_norm") var hardEmbeddingNorm: Gemma3nRMSNormWithScale - @ModuleInfo(key: "soft_embedding_norm") var softEmbeddingNorm: Gemma3nRMSNormWithScale + @ModuleInfo(key: "hard_embedding_norm") var hardEmbeddingNorm: Gemma3nRMSNorm + @ModuleInfo(key: 
"soft_embedding_norm") var softEmbeddingNorm: Gemma3nRMSNorm @ModuleInfo(key: "embedding_projection") var embeddingProjection: Linear @ModuleInfo(key: "embedding_post_projection_norm") var embeddingPostProjectionNorm: - Gemma3nRMSNormNoScale + Gemma3nRMSNorm init(multimodalConfig: any MultimodalConfig, textConfig: TextConfig) { self.multimodalHiddenSize = multimodalConfig.hiddenSize @@ -1392,11 +1356,11 @@ private class Gemma3nMultimodalEmbedder: Module, UnaryLayer { embeddingCount: vocabSize, dimensions: multimodalHiddenSize ) - self._hardEmbeddingNorm.wrappedValue = Gemma3nRMSNormWithScale( + self._hardEmbeddingNorm.wrappedValue = Gemma3nRMSNorm( dim: multimodalHiddenSize, eps: eps ) - self._softEmbeddingNorm.wrappedValue = Gemma3nRMSNormWithScale( + self._softEmbeddingNorm.wrappedValue = Gemma3nRMSNorm( dim: multimodalHiddenSize, eps: eps ) @@ -1405,9 +1369,10 @@ private class Gemma3nMultimodalEmbedder: Module, UnaryLayer { textHiddenSize, bias: false ) - self._embeddingPostProjectionNorm.wrappedValue = Gemma3nRMSNormNoScale( + self._embeddingPostProjectionNorm.wrappedValue = Gemma3nRMSNorm( dim: textHiddenSize, - eps: eps + eps: eps, + withScale: false ) super.init() @@ -2538,10 +2503,10 @@ private class Gemma3nAudioConformerAttention: Module { let postInFeatures: Int private let _gradientClipping: MLXArray - @ModuleInfo(key: "pre_attn_norm") var preAttnNorm: Gemma3nRMSNormWithScale + @ModuleInfo(key: "pre_attn_norm") var preAttnNorm: Gemma3nRMSNorm @ModuleInfo var attn: Gemma3nAudioAttention @ModuleInfo var post: Linear - @ModuleInfo(key: "post_norm") var postNorm: Gemma3nRMSNormWithScale + @ModuleInfo(key: "post_norm") var postNorm: Gemma3nRMSNorm init(config: AudioConfig) { self.config = config @@ -2549,10 +2514,10 @@ private class Gemma3nAudioConformerAttention: Module { self.postInFeatures = config.hiddenSize self._gradientClipping = MLXArray(config.gradientClipping) - self._preAttnNorm.wrappedValue = Gemma3nRMSNormWithScale(dim: config.hiddenSize) + self._preAttnNorm.wrappedValue = Gemma3nRMSNorm(dim: config.hiddenSize) self._attn.wrappedValue = Gemma3nAudioAttention(config: config) self._post.wrappedValue = Linear(postInFeatures, config.hiddenSize, bias: false) - self._postNorm.wrappedValue = Gemma3nRMSNormWithScale(dim: config.hiddenSize) + self._postNorm.wrappedValue = Gemma3nRMSNorm(dim: config.hiddenSize) super.init() } @@ -2581,20 +2546,20 @@ private class Gemma3nAudioConformerFeedForward: Module { private let _gradientClipping: MLXArray private let _postLayerScale: MLXArray - @ModuleInfo(key: "pre_layer_norm") var preLayerNorm: Gemma3nRMSNormWithScale + @ModuleInfo(key: "pre_layer_norm") var preLayerNorm: Gemma3nRMSNorm @ModuleInfo(key: "ffw_layer_1") var ffwLayer1: Linear @ModuleInfo(key: "ffw_layer_2") var ffwLayer2: Linear - @ModuleInfo(key: "post_layer_norm") var postLayerNorm: Gemma3nRMSNormWithScale + @ModuleInfo(key: "post_layer_norm") var postLayerNorm: Gemma3nRMSNorm init(config: AudioConfig) { self.config = config self._gradientClipping = MLXArray(config.gradientClipping) self._postLayerScale = MLXArray(config.confResidualWeight) - self._preLayerNorm.wrappedValue = Gemma3nRMSNormWithScale(dim: config.hiddenSize) + self._preLayerNorm.wrappedValue = Gemma3nRMSNorm(dim: config.hiddenSize) self._ffwLayer1.wrappedValue = Linear(config.hiddenSize, config.hiddenSize * 4, bias: false) self._ffwLayer2.wrappedValue = Linear(config.hiddenSize * 4, config.hiddenSize, bias: false) - self._postLayerNorm.wrappedValue = Gemma3nRMSNormWithScale(dim: config.hiddenSize) + 
self._postLayerNorm.wrappedValue = Gemma3nRMSNorm(dim: config.hiddenSize) super.init() } @@ -2618,10 +2583,10 @@ private class Gemma3nAudioConformerLightConv1d: Module { private let _gradientClipping: MLXArray let causalPadding: Int - @ModuleInfo(key: "pre_layer_norm") var preLayerNorm: Gemma3nRMSNormWithScale + @ModuleInfo(key: "pre_layer_norm") var preLayerNorm: Gemma3nRMSNorm @ModuleInfo(key: "linear_start") var linearStart: Linear @ModuleInfo(key: "depthwise_conv1d") var depthwiseConv1d: Conv1d - @ModuleInfo(key: "conv_norm") var convNorm: Gemma3nRMSNormWithScale + @ModuleInfo(key: "conv_norm") var convNorm: Gemma3nRMSNorm @ModuleInfo(key: "linear_end") var linearEnd: Linear init(config: AudioConfig) { @@ -2629,7 +2594,7 @@ private class Gemma3nAudioConformerLightConv1d: Module { self._gradientClipping = MLXArray(config.gradientClipping) self.causalPadding = config.confConvKernelSize - 1 - self._preLayerNorm.wrappedValue = Gemma3nRMSNormWithScale( + self._preLayerNorm.wrappedValue = Gemma3nRMSNorm( dim: config.hiddenSize, eps: config.rmsNormEps ) @@ -2647,7 +2612,7 @@ private class Gemma3nAudioConformerLightConv1d: Module { groups: config.hiddenSize, bias: false ) - self._convNorm.wrappedValue = Gemma3nRMSNormWithScale( + self._convNorm.wrappedValue = Gemma3nRMSNorm( dim: config.hiddenSize, eps: config.rmsNormEps ) @@ -2690,7 +2655,7 @@ private class Gemma3nAudioConformerBlock: Module { @ModuleInfo var attention: Gemma3nAudioConformerAttention @ModuleInfo var lconv1d: Gemma3nAudioConformerLightConv1d @ModuleInfo(key: "ffw_layer_end") var ffwLayerEnd: Gemma3nAudioConformerFeedForward - @ModuleInfo var norm: Gemma3nRMSNormWithScale + @ModuleInfo var norm: Gemma3nRMSNorm init(config: AudioConfig) { self.config = config @@ -2700,7 +2665,7 @@ private class Gemma3nAudioConformerBlock: Module { self._attention.wrappedValue = Gemma3nAudioConformerAttention(config: config) self._lconv1d.wrappedValue = Gemma3nAudioConformerLightConv1d(config: config) self._ffwLayerEnd.wrappedValue = Gemma3nAudioConformerFeedForward(config: config) - self._norm.wrappedValue = Gemma3nRMSNormWithScale(dim: config.hiddenSize) + self._norm.wrappedValue = Gemma3nRMSNorm(dim: config.hiddenSize) super.init() } From a5a3c1cc538647ed1a14386bed05073596591a77 Mon Sep 17 00:00:00 2001 From: Anthony DePasquale Date: Sat, 28 Jun 2025 21:49:00 +0200 Subject: [PATCH 16/19] Add TODO for custom Metal kernel --- Libraries/MLXVLM/Models/Gemma3n.swift | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/Libraries/MLXVLM/Models/Gemma3n.swift b/Libraries/MLXVLM/Models/Gemma3n.swift index 4c266d6d..dbe5518b 100644 --- a/Libraries/MLXVLM/Models/Gemma3n.swift +++ b/Libraries/MLXVLM/Models/Gemma3n.swift @@ -1474,12 +1474,10 @@ private func gemma3nAttentionWithCacheUpdate( private func bicubicInterpolate( _ x: MLXArray, to targetSize: (Int, Int), alignCorners: Bool = false ) -> MLXArray { - // TODO: This implementation uses nested loops and sequential MLX operations, which is much slower + // This implementation uses nested loops and sequential MLX operations, which is much slower // than the Python version that uses mx.fast.metal_kernel() for parallel GPU computation. - // MLX Swift currently doesn't have custom Metal kernel creation capabilities like Python's - // mx.fast.metal_kernel(). Consider optimizing with vectorized MLX operations or requesting - // custom kernel support from the MLX Swift team for better performance. - + // TODO: Port the custom Metal kernel from Python to Swift using `MLXFast.metalKernel`. 
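// Context for the TODO above (sketch, assumptions flagged): whether written as nested
// loops or as a Metal kernel, bicubic interpolation reduces to per-axis cubic weights
// over a 4x4 neighborhood. This assumes the standard Keys kernel with a = -0.5
// (PyTorch's default); the coefficients actually used in the port may differ.
func cubicKernel(_ t: Float, a: Float = -0.5) -> Float {
    let x = abs(t)
    if x <= 1 { return (a + 2) * x * x * x - (a + 3) * x * x + 1 }
    if x < 2 { return a * (x * x * x - 5 * x * x + 8 * x - 4) }
    return 0
}
// Each output pixel is then sum over the 4x4 neighbors of cubicKernel(dx) * cubicKernel(dy) * pixel.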
+ // // Input: NHWC format [batch, height, width, channels] // Output: NHWC format [batch, target_height, target_width, channels] From 52fb5cf314a01e40dc34ebeeded75fe14834fb4a Mon Sep 17 00:00:00 2001 From: Anthony DePasquale Date: Sat, 28 Jun 2025 22:21:56 +0200 Subject: [PATCH 17/19] Fixing sanitization, but too complicated --- Libraries/MLXVLM/Models/Gemma3n.swift | 484 ++++++++++++++++++++------ 1 file changed, 381 insertions(+), 103 deletions(-) diff --git a/Libraries/MLXVLM/Models/Gemma3n.swift b/Libraries/MLXVLM/Models/Gemma3n.swift index dbe5518b..322a55bf 100644 --- a/Libraries/MLXVLM/Models/Gemma3n.swift +++ b/Libraries/MLXVLM/Models/Gemma3n.swift @@ -23,53 +23,96 @@ public protocol MultimodalConfig { } public struct AudioConfig: Codable, Sendable, MultimodalConfig { - // Constants with default values (always present) - public let inputFeatSize: Int = 80 - public let hiddenSize: Int = 1536 - public let confAttentionChunkSize: Int = 12 - public let confAttentionContextLeft: Int = 13 - public let confAttentionContextRight: Int = 0 - public let confAttentionInvalidLogitsValue: Float = -1e9 - public let confAttentionLogitCap: Float = 50.0 - public let confNumAttentionHeads: Int = 8 - public let confNumHiddenLayers: Int = 12 - public let confConvKernelSize: Int = 5 - public let confPositionalBiasSize: Int = 256 - public let confReductionFactor: Int = 4 - public let confResidualWeight: Float = 0.5 - public let sscpConvChannelSize: [Int] = [128, 32] - public let sscpConvGroupNormEps: Float = 1e-3 - public let sscpConvKernelSize: [[Int]] = [[3, 3], [3, 3]] - public let sscpConvStrideSize: [[Int]] = [[2, 2], [2, 2]] - public let vocabSize: Int = 128 - public let sscpConvEps: Float = 1e-3 - public let rmsNormEps: Float = 1e-6 - public let gradientClipping: Float = 10000000000.0 - public let vocabOffset: Int = 262272 // 262_144 + 128 (text vocab size + vision vocab size) + // Use private properties with defaults, and computed properties to access them + private let _inputFeatSize: Int? + private let _hiddenSize: Int? + private let _confAttentionChunkSize: Int? + private let _confAttentionContextLeft: Int? + private let _confAttentionContextRight: Int? + private let _confAttentionInvalidLogitsValue: Float? + private let _confAttentionLogitCap: Float? + private let _confNumAttentionHeads: Int? + private let _confNumHiddenLayers: Int? + private let _confConvKernelSize: Int? + private let _confPositionalBiasSize: Int? + private let _confReductionFactor: Int? + private let _confResidualWeight: Float? + private let _sscpConvChannelSize: [Int]? + private let _sscpConvGroupNormEps: Float? + private let _sscpConvKernelSize: [[Int]]? + private let _sscpConvStrideSize: [[Int]]? + private let _vocabSize: Int? + private let _sscpConvEps: Float? + private let _rmsNormEps: Float? + private let _gradientClipping: Float? + private let _vocabOffset: Int? + + // Computed properties with defaults + public var inputFeatSize: Int { + let value = _inputFeatSize ?? 80 + // AudioConfig loading from JSON works correctly + return value + } + + public var hiddenSize: Int { + _hiddenSize ?? 1536 + } + + public var confAttentionChunkSize: Int { _confAttentionChunkSize ?? 12 } + public var confAttentionContextLeft: Int { _confAttentionContextLeft ?? 13 } + public var confAttentionContextRight: Int { _confAttentionContextRight ?? 0 } + public var confAttentionInvalidLogitsValue: Float { _confAttentionInvalidLogitsValue ?? -1e9 } + public var confAttentionLogitCap: Float { _confAttentionLogitCap ?? 
50.0 } + public var confNumAttentionHeads: Int { _confNumAttentionHeads ?? 8 } + public var confNumHiddenLayers: Int { _confNumHiddenLayers ?? 12 } + public var confConvKernelSize: Int { _confConvKernelSize ?? 5 } + public var confPositionalBiasSize: Int { _confPositionalBiasSize ?? 256 } + public var confReductionFactor: Int { _confReductionFactor ?? 4 } + public var confResidualWeight: Float { _confResidualWeight ?? 0.5 } + + public var sscpConvChannelSize: [Int] { + _sscpConvChannelSize ?? [128, 32] + } + + public var sscpConvGroupNormEps: Float { _sscpConvGroupNormEps ?? 1e-3 } + + public var sscpConvKernelSize: [[Int]] { + _sscpConvKernelSize ?? [[3, 3], [3, 3]] + } + + public var sscpConvStrideSize: [[Int]] { + _sscpConvStrideSize ?? [[2, 2], [2, 2]] + } + + public var vocabSize: Int { _vocabSize ?? 128 } + public var sscpConvEps: Float { _sscpConvEps ?? 1e-3 } + public var rmsNormEps: Float { _rmsNormEps ?? 1e-6 } + public var gradientClipping: Float { _gradientClipping ?? 10000000000.0 } + public var vocabOffset: Int { _vocabOffset ?? 262272 } enum CodingKeys: String, CodingKey { - case inputFeatSize = "input_feat_size" - case hiddenSize = "hidden_size" - case confAttentionChunkSize = "conf_attention_chunk_size" - case confAttentionContextLeft = "conf_attention_context_left" - case confAttentionContextRight = "conf_attention_context_right" - case confAttentionInvalidLogitsValue = "conf_attention_invalid_logits_value" - case confAttentionLogitCap = "conf_attention_logit_cap" - case confNumAttentionHeads = "conf_num_attention_heads" - case confNumHiddenLayers = "conf_num_hidden_layers" - case confConvKernelSize = "conf_conv_kernel_size" - case confPositionalBiasSize = "conf_positional_bias_size" - case confReductionFactor = "conf_reduction_factor" - case confResidualWeight = "conf_residual_weight" - case sscpConvChannelSize = "sscp_conv_channel_size" - case sscpConvGroupNormEps = "sscp_conv_group_norm_eps" - case sscpConvKernelSize = "sscp_conv_kernel_size" - case sscpConvStrideSize = "sscp_conv_stride_size" - case vocabSize = "vocab_size" - case sscpConvEps = "sscp_conv_eps" - case rmsNormEps = "rms_norm_eps" - case gradientClipping = "gradient_clipping" - case vocabOffset = "vocab_offset" + case _inputFeatSize = "input_feat_size" + case _hiddenSize = "hidden_size" + case _confAttentionChunkSize = "conf_attention_chunk_size" + case _confAttentionContextLeft = "conf_attention_context_left" + case _confAttentionContextRight = "conf_attention_context_right" + case _confAttentionInvalidLogitsValue = "conf_attention_invalid_logits_value" + case _confAttentionLogitCap = "conf_attention_logit_cap" + case _confNumAttentionHeads = "conf_num_attention_heads" + case _confNumHiddenLayers = "conf_num_hidden_layers" + case _confConvKernelSize = "conf_conv_kernel_size" + case _confPositionalBiasSize = "conf_positional_bias_size" + case _confReductionFactor = "conf_reduction_factor" + case _confResidualWeight = "conf_residual_weight" + case _sscpConvChannelSize = "sscp_conv_channel_size" + case _sscpConvGroupNormEps = "sscp_conv_group_norm_eps" + case _sscpConvKernelSize = "sscp_conv_kernel_size" + case _sscpConvStrideSize = "sscp_conv_stride_size" + case _vocabSize = "vocab_size" + case _sscpConvEps = "sscp_conv_eps" + case _rmsNormEps = "rms_norm_eps" + case _gradientClipping = "gradient_clipping" + case _vocabOffset = "vocab_offset" } } @@ -1630,6 +1673,8 @@ public class Gemma3n: Module, VLMModel, KVCacheDimensionProvider { public init(_ config: ModelConfig) { self.config = config + // Audio 
config loaded successfully from JSON + self._languageModel.wrappedValue = LanguageModel(config: config.textConfig) self._visionTower.wrappedValue = Gemma3nVisionModel(config: config.visionConfig) self._audioTower.wrappedValue = Gemma3nAudioModel(config: config.audioConfig) @@ -1859,10 +1904,14 @@ public class Gemma3n: Module, VLMModel, KVCacheDimensionProvider { sanitizedWeights[k] = v } } + + // CORE ISSUE: VisionTower can't load weights with blocks.blocks.X keys into @ModuleInfo var blocks: [UnaryLayer] + // All weight processing (format, transpose, dimension swap) works correctly + // Only remaining issue: MLX expects blocks.X keys but weights have blocks.blocks.X structure sanitizedWeights = visionTower.sanitize(weights: sanitizedWeights) - // TODO: The audio and language sanitization is not done in the Python implementation. Is this needed? sanitizedWeights = audioTower.sanitize(weights: sanitizedWeights) sanitizedWeights = languageModel.sanitize(weights: sanitizedWeights) + return sanitizedWeights } } @@ -2218,6 +2267,8 @@ private class Gemma3nAudioSubSampleConvProjection: Module { var calculatedBlockPadding: [[Int]] = [] var calculatedFOutDims: [Int] = [] + // Audio SSCP initialized with config values + for i in 0 ..< 2 { let (kernelH, kernelW) = ( config.sscpConvKernelSize[i][0], config.sscpConvKernelSize[i][1] @@ -2237,6 +2288,8 @@ private class Gemma3nAudioSubSampleConvProjection: Module { let fInPadded = currentFForBlockInput + padFLeft + padFRight let fOutAfterConv = (fInPadded - kernelW) / strideW + 1 + // SSCP convolution \(i) configured + calculatedFOutDims.append(fOutAfterConv) currentFForBlockInput = fOutAfterConv } @@ -2259,6 +2312,8 @@ private class Gemma3nAudioSubSampleConvProjection: Module { let finalFOut = calculatedFOutDims.last! self.inputProjInFeatures = finalCOut * finalFOut + // Audio SSCP dimensions calculated successfully + self._inputProjLinear.wrappedValue = Linear( inputProjInFeatures, config.hiddenSize, @@ -2760,14 +2815,12 @@ private class RMSNormAct2d: Module, UnaryLayer { } // MARK: - Helper Functions +// Simplified to match Python implementation private func numGroups(groupSize: Int?, channels: Int) -> Int { guard let groupSize = groupSize, groupSize > 0 else { - return 1 // normal conv with 1 group + return 1 } - // NOTE: groupSize == 1 -> depthwise conv - assert(channels % groupSize == 0) - let groups = channels / groupSize - return groups + return max(1, channels / groupSize) } private func makeDivisible( @@ -2812,6 +2865,8 @@ private class ConvNormAct: Module, UnaryLayer { ) { self.outChannels = outChannels + // ConvNormAct initialized + self._conv.wrappedValue = Conv2d( inputChannels: inChannels, outputChannels: outChannels, @@ -2864,6 +2919,8 @@ private class UniversalInvertedResidual: Module, UnaryLayer { dropPathRate: Float = 0.0, layerScaleInitValue: Float? = 1e-5 ) { + // UniversalInvertedResidual initialized + self.hasSkip = (inChannels == outChannels && stride == 1) && !noskip if stride > 1 { @@ -2874,6 +2931,7 @@ private class UniversalInvertedResidual: Module, UnaryLayer { if dwKernelSizeStart > 0 { let dwStartStride = dwKernelSizeMid > 0 ? 
1 : stride let dwStartGroups = numGroups(groupSize: groupSize, channels: inChannels) + // Depthwise start convolution self._dwStart.wrappedValue = ConvNormAct( inChannels: inChannels, outChannels: inChannels, @@ -2892,6 +2950,7 @@ private class UniversalInvertedResidual: Module, UnaryLayer { // PW Expansion let midChannels = makeDivisible(Int(Float(inChannels) * expRatio)) + // Pointwise expansion self._pwExp.wrappedValue = ConvNormAct( inChannels: inChannels, outChannels: midChannels, @@ -2906,6 +2965,7 @@ private class UniversalInvertedResidual: Module, UnaryLayer { // DW Mid if dwKernelSizeMid > 0 { let dwMidGroups = numGroups(groupSize: groupSize, channels: midChannels) + // Depthwise mid convolution self._dwMid.wrappedValue = ConvNormAct( inChannels: midChannels, outChannels: midChannels, @@ -3556,16 +3616,17 @@ private class MobileNetV5MultiScaleFusionAdapter: Module { } } -// MARK: - Vision Tower +// MARK: - Vision Tower - Flatten blocks to 1D array for @ModuleInfo compatibility private class VisionTower: Module { @ModuleInfo(key: "conv_stem") var convStem: ConvNormAct - @ModuleInfo var blocks: [[UnaryLayer]] + @ModuleInfo var blocks: [UnaryLayer] // Flattened 1D array - ISSUE: expects blocks.X keys but weights are blocks.blocks.X @ModuleInfo var msfa: MobileNetV5MultiScaleFusionAdapter let numFeatures: Int let headHiddenSize: Int let msfaIndices: (Int, Int) let msfaOutputResolution: (Int, Int) + let stageEndIndices: [Int] // Track where each stage ends in the flattened array init(config: VisionConfig) { self._convStem.wrappedValue = ConvNormAct( @@ -3580,10 +3641,13 @@ private class VisionTower: Module { self.msfaIndices = (3, 4) self.msfaOutputResolution = (16, 16) - let (numFeatures, blocks) = Self.buildBlocks(convStemOutChannels: 64) + let (numFeatures, flatBlocks, stageEndIndices) = Self.buildBlocks(convStemOutChannels: 64) self.numFeatures = numFeatures self.headHiddenSize = numFeatures - self._blocks.wrappedValue = blocks + self.stageEndIndices = stageEndIndices + + // VisionTower building works correctly - 84 blocks created + self._blocks.wrappedValue = flatBlocks // Flattened 1D array self._msfa.wrappedValue = MobileNetV5MultiScaleFusionAdapter( inChannels: [1920], @@ -3594,14 +3658,14 @@ private class VisionTower: Module { super.init() } - static func buildBlocks(convStemOutChannels: Int) -> (Int, [[UnaryLayer]]) { - var blocks: [[UnaryLayer]] = [] + static func buildBlocks(convStemOutChannels: Int) -> (Int, [UnaryLayer], [Int]) { + var flatBlocks: [UnaryLayer] = [] + var stageEndIndices: [Int] = [] var inChannels = convStemOutChannels + // Build blocks: Stage sizes are [3, 5, 37, 39] = 84 total blocks for (stage, blockConfigs) in gemma3nMobilenetDef().enumerated() { - var blockGroup: [UnaryLayer] = [] - - for config in blockConfigs { + for (blockIndex, config) in blockConfigs.enumerated() { if let edgeConfig = config as? EdgeResidualConfig { let block = EdgeResidual( inChannels: inChannels, @@ -3611,7 +3675,7 @@ private class VisionTower: Module { expandRatio: edgeConfig.expandRatio ) inChannels = edgeConfig.filters - blockGroup.append(block) + flatBlocks.append(block) } else if let uirConfig = config as? UniversalInvertedResidualConfig { let block = UniversalInvertedResidual( inChannels: inChannels, @@ -3622,7 +3686,7 @@ private class VisionTower: Module { expRatio: uirConfig.expandRatio ) inChannels = uirConfig.filters - blockGroup.append(block) + flatBlocks.append(block) } else if let attentionConfig = config as? 
MultiQueryAttentionBlockConfig { let block = MobileAttention( inChannels: inChannels, @@ -3634,13 +3698,13 @@ private class VisionTower: Module { kvStride: attentionConfig.kvStrides, actLayer: nil ) - blockGroup.append(block) + flatBlocks.append(block) } } - blocks.append(blockGroup) + stageEndIndices.append(flatBlocks.count - 1) // Record where this stage ends } - - return (inChannels, blocks) + // Total blocks: 84, stage ends: [2, 7, 44, 83] + return (inChannels, flatBlocks, stageEndIndices) } func callAsFunction( @@ -3657,11 +3721,15 @@ private class VisionTower: Module { intermediates.append(result) } - // MBV5 is constructed of 4 stages, each stage is a group of blocks - for blockGroup in blocks { + // Process blocks with stage tracking + var blockIdx = 0 + for stageEndIdx in stageEndIndices { featIdx += 1 - for block in blockGroup { - result = block(result) + + // Process all blocks in this stage + while blockIdx <= stageEndIdx { + result = blocks[blockIdx](result) + blockIdx += 1 } if msfaIndices.0 == featIdx || msfaIndices.1 == featIdx { @@ -3672,6 +3740,195 @@ private class VisionTower: Module { result = msfa(intermediates) return result } + + // Simplified weight sanitization with minimal depthwise handling + func sanitize(weights: [String: MLXArray]) -> [String: MLXArray] { + // Vision tower weight sanitization working correctly + + var sanitizedWeights = weights + var skipTranspose = false + + // Check if weights are already in MLX format + let testKey = "vision_tower.timm_model.blocks.0.0.conv_exp.weight" + if let convWeight = weights[testKey], convWeight.ndim == 4, + convWeight.shape[3] > convWeight.shape[1] + { + skipTranspose = true + } + + // Process conv weights and remap keys for flattened blocks structure + var depthwiseCount = 0 + var remappedCount = 0 + + // First pass: remap keys from 2D blocks to 1D blocks + var keysToRemap: [(String, String)] = [] + var debugBlockKeys: [String] = [] + for (k, v) in weights { + // Debug: collect all block-related keys (both patterns) + if k.contains("vision_tower.timm_model.blocks.") { + // Pattern 1: blocks.blocks.flat.remainder + if k.contains("vision_tower.timm_model.blocks.blocks.") { + let blocksComponents = k.components(separatedBy: "vision_tower.timm_model.blocks.blocks.") + if blocksComponents.count >= 2 { + let remainingPath = blocksComponents[1] + let pathComponents = remainingPath.components(separatedBy: ".") + if pathComponents.count >= 2, + Int(pathComponents[0]) != nil { + debugBlockKeys.append(k) + } + } + } + // Pattern 2: blocks.stage.block.remainder + else { + let components = k.components(separatedBy: "vision_tower.timm_model.blocks.") + if components.count >= 2 { + let remainingPath = components[1] + let pathComponents = remainingPath.components(separatedBy: ".") + if pathComponents.count >= 3, + Int(pathComponents[0]) != nil, + Int(pathComponents[1]) != nil { + debugBlockKeys.append(k) + } + } + } + } + // Key remapping: Handle both patterns + // Pattern 1: blocks.stage.block.remainder -> blocks.flatIndex.remainder + // Pattern 2: blocks.blocks.flat.remainder -> blocks.flat.remainder + if k.contains("vision_tower.timm_model.blocks.") { + // Pattern 1: blocks.blocks.flat.remainder -> blocks.flat.remainder + if k.contains("vision_tower.timm_model.blocks.blocks.") { + let blocksComponents = k.components(separatedBy: "vision_tower.timm_model.blocks.blocks.") + if blocksComponents.count >= 2 { + let remainingPath = blocksComponents[1] + let pathComponents = remainingPath.components(separatedBy: ".") + if 
pathComponents.count >= 2, + let flatIdx = Int(pathComponents[0]) { + let remainder = pathComponents.dropFirst(1).joined(separator: ".") + let newKey = "vision_tower.timm_model.blocks.\(flatIdx).\(remainder)" + keysToRemap.append((k, newKey)) + remappedCount += 1 + } + } + } + // Pattern 2: blocks.stage.block.remainder -> blocks.blocks.flat.remainder + else { + let components = k.components(separatedBy: "vision_tower.timm_model.blocks.") + if components.count >= 2 { + let remainingPath = components[1] + let pathComponents = remainingPath.components(separatedBy: ".") + + // Pattern: stage.block.remainder (e.g., "0.0.conv_exp.weight") + if pathComponents.count >= 3 { + if let stageIdx = Int(pathComponents[0]), + let blockIdx = Int(pathComponents[1]) { + // Calculate flat index: sum of blocks in previous stages + current block index + let stageSizes = [3, 5, 37, 39] // blocks per stage from debug output + var flatIdx = blockIdx + for i in 0..<stageIdx { + flatIdx += stageSizes[i] + } + let remainder = pathComponents.dropFirst(2).joined(separator: ".") + let newKey = "vision_tower.timm_model.blocks.blocks.\(flatIdx).\(remainder)" + keysToRemap.append((k, newKey)) + remappedCount += 1 + } + } + } + } + } + } + + // Apply the collected key remapping + for (oldKey, newKey) in keysToRemap { + if let value = sanitizedWeights.removeValue(forKey: oldKey) { + sanitizedWeights[newKey] = value + } + } + + // Collect any stage.block keys that were not converted + let stageBlockKeys = sanitizedWeights.keys.filter { k in + if k.contains("vision_tower.timm_model.blocks.") + && !k.contains("vision_tower.timm_model.blocks.blocks.") { + let components = k.components(separatedBy: "vision_tower.timm_model.blocks.") + if components.count >= 2 { + let remainingPath = components[1] + let pathComponents = remainingPath.components(separatedBy: ".") + if pathComponents.count >= 3, + Int(pathComponents[0]) != nil, + Int(pathComponents[1]) != nil { + return true + } + } + } + return false + } + + let finalBlocksBlocksKeys = sanitizedWeights.keys.filter { k in + k.contains("vision_tower.timm_model.blocks.blocks.") + } + + // CORE ISSUE: Model expects blocks.X keys but weights have blocks.blocks.X format + // Successfully remapped all keys but MLX still can't load blocks.blocks.X into @ModuleInfo var blocks: [UnaryLayer] + if !stageBlockKeys.isEmpty { + print("WARNING: \(stageBlockKeys.count) stage.block keys remain - these should have been converted") + } + + if !finalBlocksBlocksKeys.isEmpty { + print("INFO: Key remapping complete - \(finalBlocksBlocksKeys.count) blocks.blocks keys created") + print("ISSUE: MLX cannot load blocks.blocks.X keys into @ModuleInfo var blocks: [UnaryLayer]") + } + + // Second pass: process conv weights (dimension swap and depthwise expansion working correctly) + var dimensionSwapCount = 0 + for (k, v) in sanitizedWeights { + + if (k.contains("conv") && k.contains("weight")) + || (k.contains("attn") && k.contains("proj.weight")) + { + if v.ndim == 4 { + // Check for vision tower conv weights that need dimension swapping + // Pattern: [out, H, in, W] → [out, H, W, in] (swap dims 2,3) + let (out, dim1, dim2, dim3) = (v.shape[0], v.shape[1], v.shape[2], v.shape[3]) + let needsDimensionSwap = (dim1 == 3 || dim1 == 1) && dim2 > dim3 && dim3 <= 128 + + if k.contains("conv_exp.weight") && needsDimensionSwap { + let fixed = v.transposed(0, 1, 3, 2) // Swap dims 2,3 + sanitizedWeights[k] = fixed + dimensionSwapCount += 1 + } + // Check for depthwise conv: shape [outChannels, H, W, 1] in MLX format + else if v.shape[3] == 1 && k.contains("dw") { + // Expand depthwise weights: [outChannels, H, W, 1] -> [outChannels, H, W, outChannels] + let outChannels = v.shape[0] + let h = v.shape[1] + let w = v.shape[2] + var expandedWeight = MLXArray.zeros([outChannels, h, w, outChannels], dtype: v.dtype) + for i in 0..<outChannels { + expandedWeight[i, 0..., 0..., i] = v[i, 0..., 0..., 0] + } + sanitizedWeights[k] = expandedWeight + } + else if !skipTranspose { + // PyTorch ->
MLX transposition: [out, in, H, W] -> [out, H, W, in] + // Skip MSFA weights as they're already in correct format + sanitizedWeights[k] = v.transposed(0, 2, 3, 1) + } + } + } + } + + // Vision tower weight processing complete: all format issues resolved except key structure mismatch + return sanitizedWeights + } } // MARK: - Complete Vision Model @@ -3693,24 +3950,7 @@ private class Gemma3nVisionModel: Module { } func sanitize(weights: [String: MLXArray]) -> [String: MLXArray] { - var sanitizedWeights = weights - var skipTranspose = false - let testKey = "vision_tower.timm_model.blocks.0.0.conv_exp.weight" - if let convWeight = weights[testKey], convWeight.ndim == 4, - convWeight.shape[3] > convWeight.shape[1] - { - skipTranspose = true - } - for (k, v) in weights { - if (k.contains("conv") && k.contains("weight")) - || (k.contains("attn") && k.contains("proj.weight")) - { - if v.ndim == 4 && !skipTranspose { - sanitizedWeights[k] = v.transposed(0, 2, 3, 1) - } - } - } - return sanitizedWeights + return timmModel.sanitize(weights: weights) } } @@ -3811,28 +4051,61 @@ private class Gemma3nAudioModel: Module { return (audioencodings, currentMask) } - /// Sanitizes weights by transposing convolution layers if they are not - /// already in the expected MLX format. + /// Sanitizes weights by transposing convolution layers from PyTorch to MLX format. func sanitize(weights: [String: MLXArray]) -> [String: MLXArray] { var sanitizedWeights = weights - // Iterate over the original keys to decide which ones to modify in the copy. + var transposedCount = 0 + for (k, v) in weights { - if k.contains("conv.weight") { - if checkArrayShape(v) { - sanitizedWeights[k] = v - } else { - sanitizedWeights[k] = v.transposed(0, 2, 3, 1) + if k.contains("conv.weight") && v.ndim == 4 { + // Conv2d format detection per weight: PyTorch [out, in, H, W] vs MLX [out, H, W, in] + let (out, dim1, dim2, dim3) = (v.shape[0], v.shape[1], v.shape[2], v.shape[3]) + + // Simple heuristic: if dim1 > max(dim2, dim3) by a significant margin, it's PyTorch format + // This works because in_channels is usually much larger than kernel size H, W + let maxKernelSize = max(dim2, dim3) + let isPyTorchFormat = dim1 > maxKernelSize * 2 // in_channels > kernel_size * 2 + + // Audio tower Conv2d format detection working correctly + + // Special cases for malformed conv weights that need dimension swaps + if k.contains("conv_1.conv.weight") && v.shape == [32, 3, 128, 3] { + // Swap dimensions 2 and 3: [32, 3, 128, 3] → [32, 3, 3, 128] + let fixed = v.transposed(0, 1, 3, 2) + sanitizedWeights[k] = fixed + transposedCount += 1 + } else if k.contains("conv_0.conv.weight") && v.shape == [128, 3, 1, 3] { + // Swap dimensions 2 and 3: [128, 3, 1, 3] → [128, 3, 3, 1] + let fixed = v.transposed(0, 1, 3, 2) + sanitizedWeights[k] = fixed + transposedCount += 1 + } else if isPyTorchFormat { + // PyTorch [out, in, H, W] → MLX [out, H, W, in] + let transposed = v.transposed(0, 2, 3, 1) + sanitizedWeights[k] = transposed + transposedCount += 1 } - } else if k.contains("conv1d.weight") { - if true { - sanitizedWeights[k] = v - } else { - sanitizedWeights[k] = v.transposed(0, 2, 1) + } else if k.contains(".weight") && v.ndim == 3 && (k.contains("conv1d") || k.contains("depthwise_conv1d")) { + // Conv1d format detection per weight: PyTorch [out, in, L] vs MLX [out, L, in] + // For depthwise conv1d, we need [1536, 1, 5] → [1536, 5, 1] + let (out, dim1, dim2) = (v.shape[0], v.shape[1], v.shape[2]) + + // Better heuristic: if middle dimension is smaller than last 
+                // it's likely MLX format
+                // Otherwise it's PyTorch format needing transpose
+                let isMLXFormat = dim1 > dim2  // MLX: [out, kernel, in] where kernel > in for most cases
+
+                // Transpose only if the weight is still in PyTorch layout
+                if !isMLXFormat {
+                    // PyTorch [out, in, L] → MLX [out, L, in]
+                    let transposed = v.transposed(0, 2, 1)
+                    sanitizedWeights[k] = transposed
+                    transposedCount += 1
                 }
-            } else {
-                sanitizedWeights[k] = v
             }
         }
+
+        // At this point every audio conv weight is in the MLX layout
+
        return sanitizedWeights
     }
 }
@@ -4070,6 +4343,9 @@ public struct Gemma3nProcessorConfiguration: Codable, Sendable {
 
 extension Gemma3n {
     public convenience init(_ config: Gemma3nConfiguration) {
+        // Convert the flat Gemma3nConfiguration into the nested ModelConfig
+        // expected by the designated initializer
+
         let modelConfig = ModelConfig(
             textConfig: config.textConfig,
             visionConfig: config.visionConfig,
@@ -4080,3 +4356,5 @@ extension Gemma3n {
         self.init(modelConfig)
     }
 }
+
+

From 4cd4dcc6b141fd1a8b0df0fb5ff77c4921573e94 Mon Sep 17 00:00:00 2001
From: David Koski
Date: Tue, 1 Jul 2025 15:43:41 -0700
Subject: [PATCH 18/19] work to load weights

---
 Libraries/MLXVLM/Models/Gemma3n.swift         | 525 +++++++++---------
 .../xcshareddata/swiftpm/Package.resolved     |   4 +-
 2 files changed, 270 insertions(+), 259 deletions(-)

diff --git a/Libraries/MLXVLM/Models/Gemma3n.swift b/Libraries/MLXVLM/Models/Gemma3n.swift
index 322a55bf..c4132155 100644
--- a/Libraries/MLXVLM/Models/Gemma3n.swift
+++ b/Libraries/MLXVLM/Models/Gemma3n.swift
@@ -432,7 +432,7 @@ public struct ModelConfig: Codable, Sendable {
 private class Gemma3nRMSNorm: Module {
     let eps: Float
     let scaleShift: Float
-    @ModuleInfo var weight: MLXArray?
+    @ParameterInfo var weight: MLXArray?
 
     init(dim: Int, eps: Float = 1e-6, scaleShift: Float = 0, withScale: Bool = true) {
         self.eps = eps
@@ -472,7 +472,7 @@ private class Gemma3nLaurelBlock: Module {
         self._linearRight.wrappedValue = Linear(config.laurelRank, config.hiddenSize, bias: false)
         self._postLaurelNorm.wrappedValue = Gemma3nRMSNorm(
             dim: config.hiddenSize,
-            eps: config.rmsNormEps,
+            eps: config.rmsNormEps
         )
         super.init()
     }
@@ -693,6 +693,8 @@
         // Repeat keys and values for multi-head attention
         keys = repeated(keys, count: repeats, axis: 1)
         values = repeated(values, count: repeats, axis: 1)
+
+        print("queries.shape = \(queries.shape), keys.shape = \(keys.shape)")
 
         // Use custom attention function that supports both quantized cache and logit softcapping
         let output = gemma3nAttentionWithCacheUpdate(
@@ -760,7 +762,7 @@ private class MLP: Module, UnaryLayer {
 }
 
 private class Gemma3nAltUp: Module {
-    @ModuleInfo(key: "correct_output_scale") var correctOutputScale: MLXArray
+    @ParameterInfo(key: "correct_output_scale") var correctOutputScale: MLXArray
     @ModuleInfo(key: "correction_coefs") var correctionCoefs: Linear
     @ModuleInfo(key: "prediction_coefs") var predictionCoefs: Linear
     @ModuleInfo(key: "modality_router") var modalityRouter: Linear
@@ -790,7 +792,7 @@
         )
         self._routerNorm.wrappedValue = Gemma3nRMSNorm(
             dim: config.hiddenSize,
-            eps: config.rmsNormEps,
+            eps: config.rmsNormEps
         )
         self._routerInputScale = MLXArray(pow(Float(config.hiddenSize), -1.0))
@@ -918,7 +920,7 @@
         self._mlp.wrappedValue = MLP(config: config, layerIdx: layerIdx)
         self._inputLayernorm.wrappedValue = Gemma3nRMSNorm(
             dim: hiddenSize,
-            eps: config.rmsNormEps,
+            eps: config.rmsNormEps
         )
         self._postAttentionLayernorm.wrappedValue
= Gemma3nRMSNorm( @@ -1709,11 +1711,11 @@ public class Gemma3n: Module, VLMModel, KVCacheDimensionProvider { // Ensure no gaps between text, vision, and audio embeddings, in that order // This matches the Python assertion assert( - embedAudio.vocabOffset == config.vocabSize - config.audioConfig.vocabSize, + embedAudio.vocabOffset == config.textConfig.vocabSize - config.audioConfig.vocabSize, "Audio vocab offset mismatch" ) assert( - embedVision.vocabOffset == config.vocabSize - config.audioConfig.vocabSize + embedVision.vocabOffset == config.textConfig.vocabSize - config.audioConfig.vocabSize - config.visionConfig.vocabSize, "Vision vocab offset mismatch" ) @@ -2093,8 +2095,8 @@ private class Gemma3nCumulativeGroupNorm: Module { let useBias: Bool let reductionAxes: [Int] - @ModuleInfo var weight: MLXArray? - @ModuleInfo var bias: MLXArray? + @ParameterInfo var weight: MLXArray? + @ParameterInfo var bias: MLXArray? init( numChannels: Int, @@ -2358,7 +2360,7 @@ private class Gemma3nAudioAttention: Module { @ModuleInfo(key: "relative_position_embedding") var relativePositionEmbedding: Gemma3nAudioRelativePositionEmbedding - @ModuleInfo(key: "per_dim_scale") var perDimScale: MLXArray + @ParameterInfo(key: "per_dim_scale") var perDimScale: MLXArray @ModuleInfo(key: "q_proj") var qProj: Linear @ModuleInfo(key: "k_proj") var kProj: Linear @ModuleInfo(key: "v_proj") var vProj: Linear @@ -2746,7 +2748,7 @@ private class Gemma3nAudioConformerBlock: Module { // MARK: - Layer Scale 2D private class LayerScale2d: Module, UnaryLayer { let inplace: Bool - @ModuleInfo var gamma: MLXArray + @ParameterInfo var gamma: MLXArray init(dim: Int, initValues: Float = 1e-5, inplace: Bool = false) { self.inplace = inplace @@ -2787,7 +2789,7 @@ private class RMSNormAct2d: Module, UnaryLayer { let normalizedShape: [Int] let eps: Float let applyAct: Bool - @ModuleInfo var weight: MLXArray + @ParameterInfo var weight: MLXArray @ModuleInfo var drop: Identity @ModuleInfo var act: UnaryLayer @@ -3101,6 +3103,26 @@ private class EdgeResidual: Module, UnaryLayer { } } +private class ProjectionBlock: Module, UnaryLayer { + @ModuleInfo(key: "down_conv") private var down: Conv2d? + @ModuleInfo private var norm: RMSNormAct2d? + @ModuleInfo private var proj: Conv2d + + init(down: Conv2d? = nil, norm: RMSNormAct2d? = nil, _ proj: Conv2d) { + self._down.wrappedValue = down + self._norm.wrappedValue = norm + self._proj.wrappedValue = proj + } + + func callAsFunction(_ x: MLXArray) -> MLXArray { + var result = x + result = down?(result) ?? result + result = norm?(result) ?? 
result + result = proj(result) + return result + } +} + // MARK: - Multi-Query Attention 2D private class MultiQueryAttention2d: Module { let numHeads: Int @@ -3111,17 +3133,11 @@ private class MultiQueryAttention2d: Module { let valueDim: Int let scale: Float - @ModuleInfo(key: "query_proj") var queryProj: Conv2d - - @ModuleInfo(key: "key_down_conv") var keyDownConv: UnaryLayer - @ModuleInfo(key: "key_norm") var keyNorm: UnaryLayer - @ModuleInfo(key: "value_down_conv") var valueDownConv: UnaryLayer - @ModuleInfo(key: "value_norm") var valueNorm: UnaryLayer - - @ModuleInfo(key: "key_proj") var keyProj: Conv2d - @ModuleInfo(key: "value_proj") var valueProj: Conv2d + @ModuleInfo(key: "query") var query: ProjectionBlock + @ModuleInfo(key: "key") var key: ProjectionBlock + @ModuleInfo(key: "value") var value: ProjectionBlock @ModuleInfo(key: "attn_drop") var attnDrop: UnaryLayer - @ModuleInfo(key: "output_proj") var outputProj: Conv2d + @ModuleInfo(key: "output") var output: ProjectionBlock @ModuleInfo(key: "proj_drop") var projDrop: UnaryLayer init( @@ -3149,15 +3165,18 @@ private class MultiQueryAttention2d: Module { self.scale = pow(Float(headDim), -0.5) // Query - self._queryProj.wrappedValue = Conv2d( - inputChannels: dim, - outputChannels: numHeads * keyDim, - kernelSize: IntOrPair(1) - ) + self._query.wrappedValue = ProjectionBlock( + Conv2d( + inputChannels: dim, + outputChannels: numHeads * keyDim, + kernelSize: IntOrPair(1), + bias: false + )) // Key - if kvStride > 1 { - self._keyDownConv.wrappedValue = Conv2d( + self._key.wrappedValue = ProjectionBlock( + down: kvStride > 1 ? + Conv2d( inputChannels: dim, outputChannels: dim, kernelSize: IntOrPair(dwKernelSize), @@ -3166,23 +3185,20 @@ private class MultiQueryAttention2d: Module { dilation: IntOrPair(dilation), groups: dim, // Depthwise bias: false - ) - - self._keyNorm.wrappedValue = RMSNormAct2d(numChannels: dim, eps: 1e-6, applyAct: false) - } else { - self._keyDownConv.wrappedValue = Identity() - self._keyNorm.wrappedValue = Identity() - } - self._keyProj.wrappedValue = Conv2d( - inputChannels: dim, - outputChannels: keyDim, - kernelSize: IntOrPair(1), - bias: false - ) + ) : nil, + norm: kvStride > 1 ? + RMSNormAct2d(numChannels: dim, eps: 1e-6, applyAct: false) : nil, + Conv2d( + inputChannels: dim, + outputChannels: keyDim, + kernelSize: IntOrPair(1), + bias: false + )) // Value - if kvStride > 1 { - self._valueDownConv.wrappedValue = Conv2d( + self._value.wrappedValue = ProjectionBlock( + down: kvStride > 1 ? + Conv2d( inputChannels: dim, outputChannels: dim, kernelSize: IntOrPair(dwKernelSize), @@ -3191,31 +3207,28 @@ private class MultiQueryAttention2d: Module { dilation: IntOrPair(dilation), groups: dim, // Depthwise bias: false - ) - self._valueNorm.wrappedValue = RMSNormAct2d( - numChannels: dim, eps: 1e-6, applyAct: false) - } else { - self._valueDownConv.wrappedValue = Identity() - self._valueNorm.wrappedValue = Identity() - } - self._valueProj.wrappedValue = Conv2d( - inputChannels: dim, - outputChannels: valueDim, - kernelSize: IntOrPair(1), - bias: false - ) + ) : nil, + norm: kvStride > 1 ? + RMSNormAct2d(numChannels: dim, eps: 1e-6, applyAct: false) : nil, + Conv2d( + inputChannels: dim, + outputChannels: valueDim, + kernelSize: IntOrPair(1), + bias: false + )) // Attention dropout self._attnDrop.wrappedValue = attnDrop > 0 ? 
Dropout(p: attnDrop) : Identity() // Output projection - self._outputProj.wrappedValue = Conv2d( - inputChannels: valueDim * numHeads, - outputChannels: dimOut, - kernelSize: IntOrPair(1), - stride: IntOrPair(1), - bias: false - ) + self._output.wrappedValue = ProjectionBlock( + Conv2d( + inputChannels: valueDim * numHeads, + outputChannels: dimOut, + kernelSize: IntOrPair(1), + stride: IntOrPair(1), + bias: false + )) self._projDrop.wrappedValue = projDrop > 0 ? Dropout(p: projDrop) : Identity() @@ -3250,17 +3263,13 @@ private class MultiQueryAttention2d: Module { func callAsFunction(_ x: MLXArray, attnMask: MLXArray? = nil) -> MLXArray { let (B, H, W, C) = (x.shape[0], x.shape[1], x.shape[2], x.shape[3]) - let q = queryProj(x) + let q = query(x) let qReshaped = reshapeProjectedQuery(q, numHeads: numHeads, keyDim: keyDim) - var k = keyDownConv(x) - k = keyNorm(k) - k = keyProj(k) + var k = key(x) let kReshaped = reshapeInput(k) - var v = valueDownConv(x) - v = valueNorm(v) - v = valueProj(v) + var v = value(x) let vReshaped = reshapeInput(v) let o: MLXArray @@ -3283,7 +3292,7 @@ private class MultiQueryAttention2d: Module { wPx: W / queryStrides.1 ) - return outputProj(oReshaped) + return output(oReshaped) } } @@ -3562,20 +3571,22 @@ private class MobileNetV5MultiScaleFusionAdapter: Module { let inputsNCHW = inputs.map { $0.transposed(0, 3, 1, 2) } // Find the highest resolution (first input) - let highResolution = inputsNCHW[0].shape.suffix(2) + let highResolution = Array(inputsNCHW[0].shape.suffix(2)) var resizedInputs: [MLXArray] = [] for img in inputsNCHW { - let imgShape = img.shape.suffix(2) + let imgShape = Array(img.shape.suffix(2)) var resizedImg = img // Resize if needed using nearest neighbor interpolation if imgShape[0] < highResolution[0] || imgShape[1] < highResolution[1] { - // Simple nearest neighbor interpolation - let scaleH = Float(highResolution[0]) / Float(imgShape[0]) - let scaleW = Float(highResolution[1]) / Float(imgShape[1]) - // For simplicity, just repeat the image - in practice you'd implement proper interpolation - resizedImg = img + func s(_ i: Int) -> Float { + Float(highResolution[i]) / Float(imgShape[i]) + } + let upsample = Upsample(scaleFactor: [s(0), s(1)], mode: .linear(alignCorners: false)) + + // upsample wants NHWC + resizedImg = upsample(img.transposed(0, 2, 3, 1)).transposed(0, 3, 1, 2) } resizedInputs.append(resizedImg) @@ -3619,14 +3630,13 @@ private class MobileNetV5MultiScaleFusionAdapter: Module { // MARK: - Vision Tower - Flatten blocks to 1D array for @ModuleInfo compatibility private class VisionTower: Module { @ModuleInfo(key: "conv_stem") var convStem: ConvNormAct - @ModuleInfo var blocks: [UnaryLayer] // Flattened 1D array - ISSUE: expects blocks.X keys but weights are blocks.blocks.X + @ModuleInfo var blocks: [[UnaryLayer]] @ModuleInfo var msfa: MobileNetV5MultiScaleFusionAdapter let numFeatures: Int let headHiddenSize: Int let msfaIndices: (Int, Int) let msfaOutputResolution: (Int, Int) - let stageEndIndices: [Int] // Track where each stage ends in the flattened array init(config: VisionConfig) { self._convStem.wrappedValue = ConvNormAct( @@ -3641,13 +3651,11 @@ private class VisionTower: Module { self.msfaIndices = (3, 4) self.msfaOutputResolution = (16, 16) - let (numFeatures, flatBlocks, stageEndIndices) = Self.buildBlocks(convStemOutChannels: 64) + let (numFeatures, blocks) = Self.buildBlocks(convStemOutChannels: 64) self.numFeatures = numFeatures self.headHiddenSize = numFeatures - self.stageEndIndices = stageEndIndices - // 
VisionTower building works correctly - 84 blocks created - self._blocks.wrappedValue = flatBlocks // Flattened 1D array + self._blocks.wrappedValue = blocks self._msfa.wrappedValue = MobileNetV5MultiScaleFusionAdapter( inChannels: [1920], @@ -3658,13 +3666,13 @@ private class VisionTower: Module { super.init() } - static func buildBlocks(convStemOutChannels: Int) -> (Int, [UnaryLayer], [Int]) { - var flatBlocks: [UnaryLayer] = [] - var stageEndIndices: [Int] = [] + static func buildBlocks(convStemOutChannels: Int) -> (Int, [[UnaryLayer]]) { + var blocks: [[UnaryLayer]] = [] var inChannels = convStemOutChannels // Build blocks: Stage sizes are [3, 5, 37, 39] = 84 total blocks for (stage, blockConfigs) in gemma3nMobilenetDef().enumerated() { + var blockGroup = [UnaryLayer]() for (blockIndex, config) in blockConfigs.enumerated() { if let edgeConfig = config as? EdgeResidualConfig { let block = EdgeResidual( @@ -3675,7 +3683,7 @@ private class VisionTower: Module { expandRatio: edgeConfig.expandRatio ) inChannels = edgeConfig.filters - flatBlocks.append(block) + blockGroup.append(block) } else if let uirConfig = config as? UniversalInvertedResidualConfig { let block = UniversalInvertedResidual( inChannels: inChannels, @@ -3686,7 +3694,7 @@ private class VisionTower: Module { expRatio: uirConfig.expandRatio ) inChannels = uirConfig.filters - flatBlocks.append(block) + blockGroup.append(block) } else if let attentionConfig = config as? MultiQueryAttentionBlockConfig { let block = MobileAttention( inChannels: inChannels, @@ -3698,13 +3706,13 @@ private class VisionTower: Module { kvStride: attentionConfig.kvStrides, actLayer: nil ) - flatBlocks.append(block) + blockGroup.append(block) } } - stageEndIndices.append(flatBlocks.count - 1) // Record where this stage ends + blocks.append(blockGroup) } // Total blocks: 84, stage ends: [2, 7, 44, 83] - return (inChannels, flatBlocks, stageEndIndices) + return (inChannels, blocks) } func callAsFunction( @@ -3722,22 +3730,19 @@ private class VisionTower: Module { } // Process blocks with stage tracking - var blockIdx = 0 - for stageEndIdx in stageEndIndices { + for blockGroup in blocks { featIdx += 1 - // Process all blocks in this stage - while blockIdx <= stageEndIdx { - result = blocks[blockIdx](result) - blockIdx += 1 + for block in blockGroup { + result = block(result) } - + if msfaIndices.0 == featIdx || msfaIndices.1 == featIdx { intermediates.append(result) } } - result = msfa(intermediates) + return result } @@ -3760,132 +3765,133 @@ private class VisionTower: Module { var depthwiseCount = 0 var remappedCount = 0 - // First pass: remap keys from 2D blocks to 1D blocks - var keysToRemap: [(String, String)] = [] - var debugBlockKeys: [String] = [] - for (k, v) in weights { - // Debug: collect all block-related keys (both patterns) - if k.contains("vision_tower.timm_model.blocks.") { - // Pattern 1: blocks.blocks.flat.remainder - if k.contains("vision_tower.timm_model.blocks.blocks.") { - let blocksComponents = k.components(separatedBy: "vision_tower.timm_model.blocks.blocks.") - if blocksComponents.count >= 2 { - let remainingPath = blocksComponents[1] - let pathComponents = remainingPath.components(separatedBy: ".") - if pathComponents.count >= 2, - Int(pathComponents[0]) != nil { - debugBlockKeys.append(k) - } - } - } - // Pattern 2: blocks.stage.block.remainder - else { - let components = k.components(separatedBy: "vision_tower.timm_model.blocks.") - if components.count >= 2 { - let remainingPath = components[1] - let pathComponents = 
remainingPath.components(separatedBy: ".") - if pathComponents.count >= 3, - Int(pathComponents[0]) != nil, - Int(pathComponents[1]) != nil { - debugBlockKeys.append(k) - } - } - } - } - // Key remapping: Handle both patterns - // Pattern 1: blocks.stage.block.remainder -> blocks.flatIndex.remainder - // Pattern 2: blocks.blocks.flat.remainder -> blocks.flat.remainder - if k.contains("vision_tower.timm_model.blocks.") { - // Pattern 1: blocks.blocks.flat.remainder -> blocks.flat.remainder - if k.contains("vision_tower.timm_model.blocks.blocks.") { - let blocksComponents = k.components(separatedBy: "vision_tower.timm_model.blocks.blocks.") - if blocksComponents.count >= 2 { - let remainingPath = blocksComponents[1] - let pathComponents = remainingPath.components(separatedBy: ".") - if pathComponents.count >= 2, - let flatIdx = Int(pathComponents[0]) { - let remainder = pathComponents.dropFirst(1).joined(separator: ".") - let newKey = "vision_tower.timm_model.blocks.\(flatIdx).\(remainder)" - keysToRemap.append((k, newKey)) - remappedCount += 1 - } - } - } - // Pattern 2: blocks.stage.block.remainder -> blocks.blocks.flat.remainder - else { - let components = k.components(separatedBy: "vision_tower.timm_model.blocks.") - if components.count >= 2 { - let remainingPath = components[1] - let pathComponents = remainingPath.components(separatedBy: ".") - - // Pattern: stage.block.remainder (e.g., "0.0.conv_exp.weight") - if pathComponents.count >= 3 { - if let stageIdx = Int(pathComponents[0]), - let blockIdx = Int(pathComponents[1]) { - // Calculate flat index: sum of blocks in previous stages + current block index - let stageSizes = [3, 5, 37, 39] // blocks per stage from debug output - var flatIdx = blockIdx - for i in 0..= 2 { - let remainingPath = components[1] - let pathComponents = remainingPath.components(separatedBy: ".") - if pathComponents.count >= 3, - Int(pathComponents[0]) != nil, - Int(pathComponents[1]) != nil { - return true - } - } - } - return false - } - - let finalBlocksBlocksKeys = sanitizedWeights.keys.filter { k in - k.contains("vision_tower.timm_model.blocks.blocks.") - } - - // CORE ISSUE: Model expects blocks.X keys but weights have blocks.blocks.X format - // Successfully remapped all keys but MLX still can't load blocks.blocks.X into @ModuleInfo var blocks: [UnaryLayer] - if !stageBlockKeys.isEmpty { - print("WARNING: \(stageBlockKeys.count) stage.block keys remain - these should have been converted") - } - - if !finalBlocksBlocksKeys.isEmpty { - print("INFO: Key remapping complete - \(finalBlocksBlocksKeys.count) blocks.blocks keys created") - print("ISSUE: MLX cannot load blocks.blocks.X keys into @ModuleInfo var blocks: [UnaryLayer]") - } + // TODO dkoski -- I think we can delete, but leaving for now in case needed +// // First pass: remap keys from 2D blocks to 1D blocks +// var keysToRemap: [(String, String)] = [] +// var debugBlockKeys: [String] = [] +// for (k, v) in weights { +// // Debug: collect all block-related keys (both patterns) +// if k.contains("vision_tower.timm_model.blocks.") { +// // Pattern 1: blocks.blocks.flat.remainder +// if k.contains("vision_tower.timm_model.blocks.blocks.") { +// let blocksComponents = k.components(separatedBy: "vision_tower.timm_model.blocks.blocks.") +// if blocksComponents.count >= 2 { +// let remainingPath = blocksComponents[1] +// let pathComponents = remainingPath.components(separatedBy: ".") +// if pathComponents.count >= 2, +// Int(pathComponents[0]) != nil { +// debugBlockKeys.append(k) +// } +// } +// } +// // 
Pattern 2: blocks.stage.block.remainder +// else { +// let components = k.components(separatedBy: "vision_tower.timm_model.blocks.") +// if components.count >= 2 { +// let remainingPath = components[1] +// let pathComponents = remainingPath.components(separatedBy: ".") +// if pathComponents.count >= 3, +// Int(pathComponents[0]) != nil, +// Int(pathComponents[1]) != nil { +// debugBlockKeys.append(k) +// } +// } +// } +// } +// // Key remapping: Handle both patterns +// // Pattern 1: blocks.stage.block.remainder -> blocks.flatIndex.remainder +// // Pattern 2: blocks.blocks.flat.remainder -> blocks.flat.remainder +// if k.contains("vision_tower.timm_model.blocks.") { +// // Pattern 1: blocks.blocks.flat.remainder -> blocks.flat.remainder +// if k.contains("vision_tower.timm_model.blocks.blocks.") { +// let blocksComponents = k.components(separatedBy: "vision_tower.timm_model.blocks.blocks.") +// if blocksComponents.count >= 2 { +// let remainingPath = blocksComponents[1] +// let pathComponents = remainingPath.components(separatedBy: ".") +// if pathComponents.count >= 2, +// let flatIdx = Int(pathComponents[0]) { +// let remainder = pathComponents.dropFirst(1).joined(separator: ".") +// let newKey = "vision_tower.timm_model.blocks.\(flatIdx).\(remainder)" +// keysToRemap.append((k, newKey)) +// remappedCount += 1 +// } +// } +// } +// // Pattern 2: blocks.stage.block.remainder -> blocks.blocks.flat.remainder +// else { +// let components = k.components(separatedBy: "vision_tower.timm_model.blocks.") +// if components.count >= 2 { +// let remainingPath = components[1] +// let pathComponents = remainingPath.components(separatedBy: ".") +// +// // Pattern: stage.block.remainder (e.g., "0.0.conv_exp.weight") +// if pathComponents.count >= 3 { +// if let stageIdx = Int(pathComponents[0]), +// let blockIdx = Int(pathComponents[1]) { +// // Calculate flat index: sum of blocks in previous stages + current block index +// let stageSizes = [3, 5, 37, 39] // blocks per stage from debug output +// var flatIdx = blockIdx +// for i in 0..= 2 { +// let remainingPath = components[1] +// let pathComponents = remainingPath.components(separatedBy: ".") +// if pathComponents.count >= 3, +// Int(pathComponents[0]) != nil, +// Int(pathComponents[1]) != nil { +// return true +// } +// } +// } +// return false +// } +// +// let finalBlocksBlocksKeys = sanitizedWeights.keys.filter { k in +// k.contains("vision_tower.timm_model.blocks.blocks.") +// } +// +// // CORE ISSUE: Model expects blocks.X keys but weights have blocks.blocks.X format +// // Successfully remapped all keys but MLX still can't load blocks.blocks.X into @ModuleInfo var blocks: [UnaryLayer] +// if !stageBlockKeys.isEmpty { +// print("WARNING: \(stageBlockKeys.count) stage.block keys remain - these should have been converted") +// } +// +// if !finalBlocksBlocksKeys.isEmpty { +// print("INFO: Key remapping complete - \(finalBlocksBlocksKeys.count) blocks.blocks keys created") +// print("ISSUE: MLX cannot load blocks.blocks.X keys into @ModuleInfo var blocks: [UnaryLayer]") +// } // Second pass: process conv weights (dimension swap and depthwise expansion working correctly) var dimensionSwapCount = 0 @@ -3894,35 +3900,39 @@ private class VisionTower: Module { if (k.contains("conv") && k.contains("weight")) || (k.contains("attn") && k.contains("proj.weight")) { - if v.ndim == 4 { - // Check for vision tower conv weights that need dimension swapping - // Pattern: [out, H, in, W] → [out, H, W, in] (swap dims 2,3) - let (out, dim1, dim2, dim3) = 
(v.shape[0], v.shape[1], v.shape[2], v.shape[3]) - let needsDimensionSwap = (dim1 == 3 || dim1 == 1) && dim2 > dim3 && dim3 <= 128 - - if k.contains("conv_exp.weight") && needsDimensionSwap { - let fixed = v.transposed(0, 1, 3, 2) // Swap dims 2,3 - sanitizedWeights[k] = fixed - dimensionSwapCount += 1 - } - // Check for depthwise conv: shape [outChannels, H, W, 1] in MLX format - else if v.shape[3] == 1 && k.contains("dw") { - // Expand depthwise weights: [outChannels, H, W, 1] -> [outChannels, H, W, outChannels] - let outChannels = v.shape[0] - let h = v.shape[1] - let w = v.shape[2] - var expandedWeight = MLXArray.zeros([outChannels, h, w, outChannels], dtype: v.dtype) - for i in 0.. MLX transposition: [out, in, H, W] -> [out, H, W, in] - // Skip MSFA weights as they're already in correct format - sanitizedWeights[k] = v.transposed(0, 2, 3, 1) - } + if v.ndim == 4 && !skipTranspose { + sanitizedWeights[k] = v.transposed(0, 2, 3, 1) } + // TODO dkoski -- I think we can delete, but leaving for now in case needed +// if v.ndim == 4 { +// // Check for vision tower conv weights that need dimension swapping +// // Pattern: [out, H, in, W] → [out, H, W, in] (swap dims 2,3) +// let (out, dim1, dim2, dim3) = (v.shape[0], v.shape[1], v.shape[2], v.shape[3]) +// let needsDimensionSwap = (dim1 == 3 || dim1 == 1) && dim2 > dim3 && dim3 <= 128 +// +// if k.contains("conv_exp.weight") && needsDimensionSwap { +// let fixed = v.transposed(0, 1, 3, 2) // Swap dims 2,3 +// sanitizedWeights[k] = fixed +// dimensionSwapCount += 1 +// } +// // Check for depthwise conv: shape [outChannels, H, W, 1] in MLX format +// else if v.shape[3] == 1 && k.contains("dw") { +// // Expand depthwise weights: [outChannels, H, W, 1] -> [outChannels, H, W, outChannels] +// let outChannels = v.shape[0] +// let h = v.shape[1] +// let w = v.shape[2] +// var expandedWeight = MLXArray.zeros([outChannels, h, w, outChannels], dtype: v.dtype) +// for i in 0.. MLX transposition: [out, in, H, W] -> [out, H, W, in] +// // Skip MSFA weights as they're already in correct format +// sanitizedWeights[k] = v.transposed(0, 2, 3, 1) +// } +// } } } @@ -4296,7 +4306,8 @@ public struct Gemma3nProcessorConfiguration: Codable, Sendable { public let doResize: Bool public let imageMean: [CGFloat] public let imageStd: [CGFloat] - public let visionSoftTokensPerImage: Int + public let _visionSoftTokensPerImage: Int? + public var visionSoftTokensPerImage: Int { _visionSoftTokensPerImage ?? 
256 } public let resample: Int public let rescaleFactor: Float public let size: ImageSize @@ -4334,7 +4345,7 @@ public struct Gemma3nProcessorConfiguration: Codable, Sendable { case doPanAndScan = "do_pan_and_scan" case imageMean = "image_mean" case imageStd = "image_std" - case visionSoftTokensPerImage = "vision_soft_tokens_per_image" + case _visionSoftTokensPerImage = "vision_soft_tokens_per_image" case resample case rescaleFactor = "rescale_factor" case size diff --git a/mlx-swift-examples.xcodeproj/project.xcworkspace/xcshareddata/swiftpm/Package.resolved b/mlx-swift-examples.xcodeproj/project.xcworkspace/xcshareddata/swiftpm/Package.resolved index 59ed2b8a..1715e4cc 100644 --- a/mlx-swift-examples.xcodeproj/project.xcworkspace/xcshareddata/swiftpm/Package.resolved +++ b/mlx-swift-examples.xcodeproj/project.xcworkspace/xcshareddata/swiftpm/Package.resolved @@ -1,5 +1,5 @@ { - "originHash" : "5b8f479687d916677158d7747e9b766ff83f08092c1595079ff9b70c909c6250", + "originHash" : "0777c427cd29bb45ee52257882d29c3c2063039870a79b9b91a32154eb35f7b5", "pins" : [ { "identity" : "gzipswift", @@ -25,7 +25,7 @@ "location" : "https://github.com/ml-explore/mlx-swift", "state" : { "branch" : "improved-parameter-errors", - "revision" : "1c6ce2485f879b53e64a5e599d5a9769b8036786" + "revision" : "f4609296282d838d12254c609638bac6b96e7336" } }, { From 542cf457bbe8b85293a2926a6c9ecaac93f9ea23 Mon Sep 17 00:00:00 2001 From: Anthony DePasquale Date: Wed, 2 Jul 2025 10:45:49 +0200 Subject: [PATCH 19/19] Remove unneeded sanitization code --- Libraries/MLXVLM/Models/Gemma3n.swift | 160 -------------------------- 1 file changed, 160 deletions(-) diff --git a/Libraries/MLXVLM/Models/Gemma3n.swift b/Libraries/MLXVLM/Models/Gemma3n.swift index c4132155..585d68ef 100644 --- a/Libraries/MLXVLM/Models/Gemma3n.swift +++ b/Libraries/MLXVLM/Models/Gemma3n.swift @@ -3765,134 +3765,6 @@ private class VisionTower: Module { var depthwiseCount = 0 var remappedCount = 0 - // TODO dkoski -- I think we can delete, but leaving for now in case needed -// // First pass: remap keys from 2D blocks to 1D blocks -// var keysToRemap: [(String, String)] = [] -// var debugBlockKeys: [String] = [] -// for (k, v) in weights { -// // Debug: collect all block-related keys (both patterns) -// if k.contains("vision_tower.timm_model.blocks.") { -// // Pattern 1: blocks.blocks.flat.remainder -// if k.contains("vision_tower.timm_model.blocks.blocks.") { -// let blocksComponents = k.components(separatedBy: "vision_tower.timm_model.blocks.blocks.") -// if blocksComponents.count >= 2 { -// let remainingPath = blocksComponents[1] -// let pathComponents = remainingPath.components(separatedBy: ".") -// if pathComponents.count >= 2, -// Int(pathComponents[0]) != nil { -// debugBlockKeys.append(k) -// } -// } -// } -// // Pattern 2: blocks.stage.block.remainder -// else { -// let components = k.components(separatedBy: "vision_tower.timm_model.blocks.") -// if components.count >= 2 { -// let remainingPath = components[1] -// let pathComponents = remainingPath.components(separatedBy: ".") -// if pathComponents.count >= 3, -// Int(pathComponents[0]) != nil, -// Int(pathComponents[1]) != nil { -// debugBlockKeys.append(k) -// } -// } -// } -// } -// // Key remapping: Handle both patterns -// // Pattern 1: blocks.stage.block.remainder -> blocks.flatIndex.remainder -// // Pattern 2: blocks.blocks.flat.remainder -> blocks.flat.remainder -// if k.contains("vision_tower.timm_model.blocks.") { -// // Pattern 1: blocks.blocks.flat.remainder -> blocks.flat.remainder -// 
if k.contains("vision_tower.timm_model.blocks.blocks.") { -// let blocksComponents = k.components(separatedBy: "vision_tower.timm_model.blocks.blocks.") -// if blocksComponents.count >= 2 { -// let remainingPath = blocksComponents[1] -// let pathComponents = remainingPath.components(separatedBy: ".") -// if pathComponents.count >= 2, -// let flatIdx = Int(pathComponents[0]) { -// let remainder = pathComponents.dropFirst(1).joined(separator: ".") -// let newKey = "vision_tower.timm_model.blocks.\(flatIdx).\(remainder)" -// keysToRemap.append((k, newKey)) -// remappedCount += 1 -// } -// } -// } -// // Pattern 2: blocks.stage.block.remainder -> blocks.blocks.flat.remainder -// else { -// let components = k.components(separatedBy: "vision_tower.timm_model.blocks.") -// if components.count >= 2 { -// let remainingPath = components[1] -// let pathComponents = remainingPath.components(separatedBy: ".") -// -// // Pattern: stage.block.remainder (e.g., "0.0.conv_exp.weight") -// if pathComponents.count >= 3 { -// if let stageIdx = Int(pathComponents[0]), -// let blockIdx = Int(pathComponents[1]) { -// // Calculate flat index: sum of blocks in previous stages + current block index -// let stageSizes = [3, 5, 37, 39] // blocks per stage from debug output -// var flatIdx = blockIdx -// for i in 0..= 2 { -// let remainingPath = components[1] -// let pathComponents = remainingPath.components(separatedBy: ".") -// if pathComponents.count >= 3, -// Int(pathComponents[0]) != nil, -// Int(pathComponents[1]) != nil { -// return true -// } -// } -// } -// return false -// } -// -// let finalBlocksBlocksKeys = sanitizedWeights.keys.filter { k in -// k.contains("vision_tower.timm_model.blocks.blocks.") -// } -// -// // CORE ISSUE: Model expects blocks.X keys but weights have blocks.blocks.X format -// // Successfully remapped all keys but MLX still can't load blocks.blocks.X into @ModuleInfo var blocks: [UnaryLayer] -// if !stageBlockKeys.isEmpty { -// print("WARNING: \(stageBlockKeys.count) stage.block keys remain - these should have been converted") -// } -// -// if !finalBlocksBlocksKeys.isEmpty { -// print("INFO: Key remapping complete - \(finalBlocksBlocksKeys.count) blocks.blocks keys created") -// print("ISSUE: MLX cannot load blocks.blocks.X keys into @ModuleInfo var blocks: [UnaryLayer]") -// } - // Second pass: process conv weights (dimension swap and depthwise expansion working correctly) var dimensionSwapCount = 0 for (k, v) in sanitizedWeights { @@ -3903,36 +3775,6 @@ private class VisionTower: Module { if v.ndim == 4 && !skipTranspose { sanitizedWeights[k] = v.transposed(0, 2, 3, 1) } - // TODO dkoski -- I think we can delete, but leaving for now in case needed -// if v.ndim == 4 { -// // Check for vision tower conv weights that need dimension swapping -// // Pattern: [out, H, in, W] → [out, H, W, in] (swap dims 2,3) -// let (out, dim1, dim2, dim3) = (v.shape[0], v.shape[1], v.shape[2], v.shape[3]) -// let needsDimensionSwap = (dim1 == 3 || dim1 == 1) && dim2 > dim3 && dim3 <= 128 -// -// if k.contains("conv_exp.weight") && needsDimensionSwap { -// let fixed = v.transposed(0, 1, 3, 2) // Swap dims 2,3 -// sanitizedWeights[k] = fixed -// dimensionSwapCount += 1 -// } -// // Check for depthwise conv: shape [outChannels, H, W, 1] in MLX format -// else if v.shape[3] == 1 && k.contains("dw") { -// // Expand depthwise weights: [outChannels, H, W, 1] -> [outChannels, H, W, outChannels] -// let outChannels = v.shape[0] -// let h = v.shape[1] -// let w = v.shape[2] -// var expandedWeight = 
MLXArray.zeros([outChannels, h, w, outChannels], dtype: v.dtype) -// for i in 0.. MLX transposition: [out, in, H, W] -> [out, H, W, in] -// // Skip MSFA weights as they're already in correct format -// sanitizedWeights[k] = v.transposed(0, 2, 3, 1) -// } -// } } } @@ -4367,5 +4209,3 @@ extension Gemma3n { self.init(modelConfig) } } - -
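The sanitize(weights:) rewrites in this series all revolve around one layout conversion: PyTorch checkpoints store Conv2d weights as [out, in, H, W] and Conv1d weights as [out, in, L], while MLX convolutions expect [out, H, W, in] and [out, L, in]. A minimal standalone sketch of that conversion, reusing the audio sanitizer's shape heuristics (the helper names are illustrative, not part of the model code):

import MLX

// Decide whether a 4-D conv weight still uses the PyTorch [out, in, H, W] layout.
// in_channels is usually much larger than the spatial kernel, so a second
// dimension well above the kernel size suggests the PyTorch layout.
func looksLikePyTorchConv2d(_ w: MLXArray) -> Bool {
    w.ndim == 4 && w.shape[1] > max(w.shape[2], w.shape[3]) * 2
}

// Move a conv weight into the MLX layout, leaving already-converted weights alone.
func toMLXLayout(_ w: MLXArray) -> MLXArray {
    if looksLikePyTorchConv2d(w) {
        return w.transposed(0, 2, 3, 1)  // Conv2d: [out, in, H, W] -> [out, H, W, in]
    }
    if w.ndim == 3 && w.shape[1] <= w.shape[2] {
        return w.transposed(0, 2, 1)  // Conv1d: [out, in, L] -> [out, L, in]
    }
    return w
}

Because the heuristic can misfire when the input channel count is close to the kernel size, the audio sanitizer matches the two known malformed shapes exactly before falling back to it.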
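Several of the one-line changes in PATCH 18 swap @ModuleInfo for @ParameterInfo on bare MLXArray properties (weight, bias, gamma, per_dim_scale, correct_output_scale). As these patches use the two wrappers, @ParameterInfo registers a raw array in the module's parameter tree while @ModuleInfo is for child layers; the exact failure mode of the old annotation isn't shown in the series, so treat this as a sketch of the convention rather than a specification. ScaledLinear is hypothetical:

import MLX
import MLXNN

class ScaledLinear: Module, UnaryLayer {
    @ParameterInfo(key: "scale") var scale: MLXArray  // raw tensor -> parameter
    @ModuleInfo(key: "inner") var inner: Linear       // child layer -> module

    init(dim: Int) {
        self._scale.wrappedValue = MLXArray.ones([dim])
        self._inner.wrappedValue = Linear(dim, dim)
        super.init()
    }

    func callAsFunction(_ x: MLXArray) -> MLXArray {
        inner(x) * scale
    }
}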
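PATCH 18's switch from a flattened @ModuleInfo var blocks: [UnaryLayer] to a nested [[UnaryLayer]] is what made the commented-out key-remapping pass (deleted above) unnecessary: MLXNN derives parameter key paths from the module hierarchy, so a nested array naturally produces the blocks.<stage>.<block>... paths that the checkpoint already uses. A toy illustration of the idea (TinyTower is hypothetical):

import MLX
import MLXNN

class TinyTower: Module {
    // A two-level array yields parameter keys such as "blocks.0.1.weight",
    // matching checkpoints that are organized stage-by-stage.
    @ModuleInfo var blocks: [[Linear]]

    init() {
        self._blocks.wrappedValue = [
            [Linear(4, 4), Linear(4, 4)],  // stage 0 -> blocks.0.0, blocks.0.1
            [Linear(4, 4)],                // stage 1 -> blocks.1.0
        ]
        super.init()
    }
}

// Inspecting TinyTower().parameters() shows the two-level indices, the same
// structure as the vision tower's safetensors keys.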
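The multi-scale fusion adapter's resize fix in PATCH 18 replaces a "just reuse the image" placeholder with MLXNN's Upsample. One wrinkle carried over from the patch: Upsample operates on NHWC tensors while the adapter tracks features as NCHW, hence the transpose in and back out. A condensed sketch (resizeToMatch is an illustrative helper; the Upsample call mirrors the one in the patch):

import MLX
import MLXNN

// Bilinearly resize an NCHW feature map up to a target spatial size.
func resizeToMatch(_ img: MLXArray, height: Int, width: Int) -> MLXArray {
    let (h, w) = (img.shape[2], img.shape[3])  // NCHW
    guard h < height || w < width else { return img }

    let upsample = Upsample(
        scaleFactor: [Float(height) / Float(h), Float(width) / Float(w)],
        mode: .linear(alignCorners: false)
    )

    // Upsample expects NHWC, so transpose in and back out.
    return upsample(img.transposed(0, 2, 3, 1)).transposed(0, 3, 1, 2)
}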
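Finally, the _visionSoftTokensPerImage change is the usual Codable pattern for a field that older processor configs may omit: decode into an optional backing property, then expose a non-optional value with a default. A self-contained example with an illustrative struct name and the 256-token default from the patch:

import Foundation

struct ProcessorConfig: Codable {
    // Backing storage decodes to nil when the key is missing from the JSON.
    private let _softTokens: Int?
    var softTokens: Int { _softTokens ?? 256 }

    enum CodingKeys: String, CodingKey {
        case _softTokens = "vision_soft_tokens_per_image"
    }
}

let data = Data("{}".utf8)
let config = try! JSONDecoder().decode(ProcessorConfig.self, from: data)
print(config.softTokens)  // 256 when the key is absent

The synthesized decoder uses decodeIfPresent for optional properties, which is why the empty JSON object above decodes without error.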