Custom attention with cache update

DePasqualeOrg · DePasqualeOrg · commit 2d4fcadceb98 · 2025-06-27T10:40:46.000+02:00
diff --git a/Libraries/MLXVLM/Models/Gemma3n.swift b/Libraries/MLXVLM/Models/Gemma3n.swift
@@ -538,33 +538,24 @@ private class Gemma3nAttention: Module {
       values = vProj(x).reshaped(hiddenShape)
       values = vNorm(values)
       values = values.transposed(0, 2, 1, 3)
-
-      if let cache = cache {
-        (keys, values) = cache.update(keys: keys, values: values)
-      }
     }
 
+    // Repeat keys and values for multi-head attention
     keys = repeated(keys, count: repeats, axis: 1)
     values = repeated(values, count: repeats, axis: 1)
 
-    var attnWeights = matmul(queries, keys.swappedAxes(2, 3)) * scale
-
-    if attnLogitSoftcapping > 0 {
-      attnWeights = attnWeights / attnLogitSoftcapping
-      attnWeights = tanh(attnWeights)
-      attnWeights = attnWeights * attnLogitSoftcapping
-    }
-
-    if case .array(let maskArray) = mask {
-      let causalMask = maskArray[0..., ..<keys.shape[2]]
-      attnWeights = attnWeights + causalMask
-    }
-
-    attnWeights = softmax(attnWeights.asType(.float32), axis: -1).asType(queries.dtype)
-
-    let output = matmul(attnWeights, values)
-      .transposed(0, 2, 1, 3)
-      .reshaped(inputShape + [-1])
+    // Use custom attention function that supports both quantized cache and logit softcapping
+    let output = gemma3nAttentionWithCacheUpdate(
+      queries: queries,
+      keys: keys,
+      values: values,
+      cache: cache,
+      scale: scale,
+      attnLogitSoftcapping: attnLogitSoftcapping,
+      mask: mask ?? .none
+    )
+    .transposed(0, 2, 1, 3)
+    .reshaped(inputShape + [-1])
 
     return oProj(output)
   }
@@ -1308,6 +1299,72 @@ private class Gemma3nMultimodalEmbedder: Module, UnaryLayer {
 
 // MARK: - Helper Functions
 
+// MARK: - Custom Attention for Gemma3n with Logit Softcapping
+
+/// Custom attention function for Gemma3n that supports:
+/// - Logit softcapping (applied before softmax) 
+/// - Standard KV cache support
+/// - Exact alignment with Python implementation
+///
+/// TODO: Quantized KV Cache Integration
+/// Action items for adding quantized cache support:
+/// 1. Add QuantizedKVCache detection: `if let quantizedKVCache = cache as? QuantizedKVCache`
+/// 2. Use quantizedKVCache.updateQuantized(keys: keys, values: values) for cache update
+/// 3. Implement manual quantized attention computation with logit softcapping:
+///    - Cannot use quantizedScaledDotProductAttention directly (no softcapping support)
+///    - Need to manually compute: matmul(queries, dequantized_keys) with softcapping
+///    - May require dequantization of keys for logit softcapping application
+/// 4. Consider performance trade-offs: 
+///    - Manual dequantization vs quantized attention benefits
+///    - Might need hybrid approach or dedicated quantized+softcapping function
+/// 5. Test with QuantizedKVCache to ensure numerical accuracy matches Python
+/// 6. Update documentation and examples
+private func gemma3nAttentionWithCacheUpdate(
+  queries: MLXArray,
+  keys: MLXArray,
+  values: MLXArray,
+  cache: KVCache?,
+  scale: Float,
+  attnLogitSoftcapping: Float,
+  mask: MLXFast.ScaledDotProductAttentionMaskMode = .none
+) -> MLXArray {
+  // Update cache and get cached keys/values (matches Python's cache.update_and_fetch)
+  let (cachedKeys, cachedValues): (MLXArray, MLXArray)
+  
+  if let cache = cache {
+    (cachedKeys, cachedValues) = cache.update(keys: keys, values: values)
+  } else {
+    (cachedKeys, cachedValues) = (keys, values)
+  }
+  
+  // Manual attention computation to support logit softcapping
+  // This matches the Python implementation exactly:
+  // attn_weights = mx.matmul(queries, keys.swapaxes(2, 3)) * self.scale
+  var attnWeights = matmul(queries, cachedKeys.swappedAxes(2, 3)) * scale
+  
+  // Apply logit softcapping if enabled (matches Python)
+  // if self.attn_logit_softcapping is not None and self.attn_logit_softcapping > 0:
+  if attnLogitSoftcapping > 0 {
+    attnWeights = attnWeights / attnLogitSoftcapping
+    attnWeights = tanh(attnWeights)
+    attnWeights = attnWeights * attnLogitSoftcapping
+  }
+  
+  // Apply mask if provided (matches Python)
+  // if mask is not None: causal_mask = mask[:, : keys.shape[-2]]
+  if case .array(let maskArray) = mask {
+    let causalMask = maskArray[0..., ..<cachedKeys.shape[2]]
+    attnWeights = attnWeights + causalMask
+  }
+  
+  // Apply softmax and compute output (matches Python)
+  // attn_weights = mx.softmax(attn_weights.astype(mx.float32), axis=-1).astype(queries.dtype)
+  attnWeights = softmax(attnWeights.asType(.float32), axis: -1).asType(queries.dtype)
+  
+  // output = mx.matmul(attn_weights, values)
+  return matmul(attnWeights, cachedValues)
+}
+
 private func bicubicInterpolate(_ x: MLXArray, to targetSize: (Int, Int), alignCorners: Bool = false) -> MLXArray {
   // TODO: This implementation uses nested loops and sequential MLX operations, which is much slower
   // than the Python version that uses mx.fast.metal_kernel() for parallel GPU computation.