ml-explore
diff --git a/Libraries/MLXLLM/Models/Qwen3.swift b/Libraries/MLXLLM/Models/Qwen3.swift
Lines changed: 9 additions & 3 deletions
diff --git a/Libraries/MLXLMCommon/AttentionUtils.swift b/Libraries/MLXLMCommon/AttentionUtils.swift
Lines changed: 77 additions & 0 deletions
diff --git a/Libraries/MLXLMCommon/Evaluate.swift b/Libraries/MLXLMCommon/Evaluate.swift
Lines changed: 47 additions & 0 deletions
@@ -76,17 +76,23 @@ private class Attention: Module {
         keys = kNorm(keys.reshaped(B, L, args.kvHeads, -1)).transposed(0, 2, 1, 3)
         values = values.reshaped(B, L, args.kvHeads, -1).transposed(0, 2, 1, 3)
 
+        // Apply RoPE positioning
         if let cache {
             queries = rope(queries, offset: cache.offset)
             keys = rope(keys, offset: cache.offset)
-            (keys, values) = cache.update(keys: keys, values: values)
         } else {
             queries = rope(queries)
             keys = rope(keys)
         }
 
-        let output = MLXFast.scaledDotProductAttention(
-            queries: queries, keys: keys, values: values, scale: scale, mask: mask
+        // Use the automatic attention router that handles both quantized and regular caches
+        let output = attentionWithCacheUpdate(
+            queries: queries,
+            keys: keys,
+            values: values,
+            cache: cache,
+            scale: scale,
+            mask: mask
         )
         .transposed(0, 2, 1, 3)
         .reshaped(B, L, -1)
 
@@ -0,0 +1,77 @@
+import Foundation
+import MLX
+import MLXFast
+
+/// Attention utilities that match Python mlx-lm's interface.
+///
+/// This file provides a single function that automatically routes to quantized or regular
+/// attention based on the cache type, matching Python's `scaled_dot_product_attention`.
+
+/// Updates the KV cache (when one is given) and computes scaled dot-product attention.
+///
+/// Modeled on Python mlx-lm's `scaled_dot_product_attention` in base.py. Routing, in order:
+/// - `cache` is `nil`: `MLXFast.scaledDotProductAttention` on `keys`/`values` exactly as passed
+/// - `cache as? QuantizedKVCache` succeeds: `updateQuantized`, then `quantizedScaledDotProductAttention`
+/// - any other cache: `cache.update`, then `MLXFast.scaledDotProductAttention` on the returned pair
+/// - The cache update happens inside this function; callers pass the new, not-yet-cached keys/values
+///
+/// **Usage in models:**
+/// ```swift
+/// let output = attentionWithCacheUpdate(
+///     queries: queries,
+///     keys: keys,
+///     values: values,
+///     cache: cache,
+///     scale: scale,
+///     mask: mask
+/// )
+/// ```
+///
+/// - Parameters:
+///   - queries: Query tensor [B, nHeads, L, D]
+///   - keys: New key tensor for this step, not yet cached [B, nKVHeads, L, D]
+///   - values: New value tensor for this step, not yet cached [B, nKVHeads, L, D]
+///   - cache: Optional cache; `nil` skips the update and attends over `keys`/`values` directly
+///   - scale: Attention scale factor, forwarded unchanged to the attention call
+///   - mask: Attention mask mode, forwarded unchanged (defaults to `.none`)
+/// - Returns: Attention output [B, nHeads, L, D]
+public func attentionWithCacheUpdate(
+    queries: MLXArray,
+    keys: MLXArray,
+    values: MLXArray,
+    cache: KVCache?,
+    scale: Float,
+    mask: MLXFast.ScaledDotProductAttentionMaskMode = .none
+) -> MLXArray {
+    guard let cache else {  // no cache: nothing to update, attend over the inputs as given
+        return MLXFast.scaledDotProductAttention(
+            queries: queries,
+            keys: keys,
+            values: values,
+            scale: scale,
+            mask: mask
+        )
+    }
+    if let quantizedKVCache = cache as? QuantizedKVCache {  // quantized path is checked first
+        let (quantizedKeys, quantizedValues) = quantizedKVCache.updateQuantized(
+            keys: keys, values: values)
+        return quantizedScaledDotProductAttention(
+            queries: queries,
+            quantizedKeys: quantizedKeys,
+            quantizedValues: quantizedValues,
+            scale: scale,
+            mask: mask,
+            groupSize: quantizedKVCache.groupSize,  // quantization settings are read from the cache
+            bits: quantizedKVCache.bits
+        )
+    } else {  // every other cache type goes through the plain `update` call
+        let (cachedKeys, cachedValues) = cache.update(keys: keys, values: values)
+        return MLXFast.scaledDotProductAttention(
+            queries: queries,
+            keys: cachedKeys,
+            values: cachedValues,
+            scale: scale,
+            mask: mask
+        )
+    }
+}
@@ -59,6 +59,19 @@ public struct GenerateParameters: Sendable {
     /// Maximum tokens to generate
     public var maxTokens: Int?
 
+    /// Maximum size of the key-value cache. Old entries (except the first 4 tokens) will be overwritten.
+    /// When set, uses ``RotatingKVCache`` instead of ``KVCacheSimple``
+    public var maxKVSize: Int?
+
+    /// Number of bits to use for KV cache quantization. nil implies no cache quantization.
+    public var kvBits: Int?
+
+    /// Group size for KV cache quantization (default: 64)
+    public var kvGroupSize: Int = 64
+
+    /// Step to begin using a quantized KV cache when kvBits is non-nil (default: 0)
+    public var quantizedKVStart: Int = 0
+
     /// sampling temperature
     public var temperature: Float = 0.6
 
@@ -73,10 +86,18 @@ public struct GenerateParameters: Sendable {
 
     public init(
         maxTokens: Int? = nil,
+        maxKVSize: Int? = nil,
+        kvBits: Int? = nil,
+        kvGroupSize: Int = 64,
+        quantizedKVStart: Int = 0,
         temperature: Float = 0.6, topP: Float = 1.0, repetitionPenalty: Float? = nil,
         repetitionContextSize: Int = 20
     ) {
         self.maxTokens = maxTokens
+        self.maxKVSize = maxKVSize
+        self.kvBits = kvBits
+        self.kvGroupSize = kvGroupSize
+        self.quantizedKVStart = quantizedKVStart
         self.temperature = temperature
         self.topP = topP
         self.repetitionPenalty = repetitionPenalty
@@ -257,6 +278,11 @@ public struct TokenIterator: Sequence, IteratorProtocol {
     var tokenCount = 0
     let maxTokens: Int?
 
+    // Cache quantization parameters
+    let kvBits: Int?
+    let kvGroupSize: Int
+    let quantizedKVStart: Int
+
     /// Initialize a `TokenIterator` with the given tokens. Note: this has been
     /// replaced with ``init(input:model:cache:parameters:)``.
     ///
@@ -278,6 +304,10 @@ public struct TokenIterator: Sequence, IteratorProtocol {
         self.sampler = parameters.sampler()
         self.maxTokens = parameters.maxTokens
 
+        self.kvBits = parameters.kvBits
+        self.kvGroupSize = parameters.kvGroupSize
+        self.quantizedKVStart = parameters.quantizedKVStart
+
         try prepare(input: .init(text: y), windowSize: parameters.prefillStepSize)
     }
 
@@ -305,6 +335,10 @@ public struct TokenIterator: Sequence, IteratorProtocol {
         self.sampler = parameters.sampler()
         self.maxTokens = parameters.maxTokens
 
+        self.kvBits = parameters.kvBits
+        self.kvGroupSize = parameters.kvGroupSize
+        self.quantizedKVStart = parameters.quantizedKVStart
+
         try prepare(input: input, windowSize: parameters.prefillStepSize)
     }
 
@@ -331,6 +365,11 @@ public struct TokenIterator: Sequence, IteratorProtocol {
         self.sampler = sampler
         self.maxTokens = maxTokens
 
+        // No cache quantization for this direct initialization
+        self.kvBits = nil
+        self.kvGroupSize = 64
+        self.quantizedKVStart = 0
+
         try prepare(input: input, windowSize: prefillStepSize)
     }
 
@@ -373,6 +412,14 @@ public struct TokenIterator: Sequence, IteratorProtocol {
             previous[text: .newAxis], cache: cache.isEmpty ? nil : cache, state: state)
         self.state = result.state
 
+        // Apply dynamic cache quantization after each step
+        maybeQuantizeKVCache(
+            cache: &cache,
+            kvBits: kvBits,
+            kvGroupSize: kvGroupSize,
+            quantizedKVStart: quantizedKVStart
+        )
+
         return convertToToken(logits: result.logits)
     }