@@ -33,10 +33,10 @@ class MLXService {
     ]
 
     /// Cache to store loaded model containers to avoid reloading.
-    private let modelCache = NSCache<NSString, ModelContainer>()
+    private var modelCache: [String: ModelContainer] = [:]
 
-    /// Tracks the ID of the last model used to detect changes for cache invalidation.
-    private var lastUsedModelId: String?
+    /// Stores a prompt cache for each loaded model
+    private var promptCache: [String: PromptCache] = [:]
 
     /// Tracks the current model download progress.
     /// Access this property to monitor model download status.
@@ -52,7 +52,7 @@ class MLXService {
         MLX.GPU.set(cacheLimit: 20 * 1024 * 1024)
 
         // Return cached model if available to avoid reloading
-        if let container = modelCache.object(forKey: model.name as NSString) {
+        if let container = modelCache[model.name] {
             return container
         } else {
             // Select appropriate factory based on model type
@@ -74,8 +74,7 @@ class MLXService {
             }
 
             // Cache the loaded model for future use
-            modelCache.setObject(container, forKey: model.name as NSString)
-
+            modelCache[model.name] = container
             return container
         }
     }
@@ -90,14 +89,6 @@ class MLXService {
         // Load or retrieve model from cache
         let modelContainer = try await load(model: model)
 
-        // Check if the model has changed since last generation
-        if lastUsedModelId != model.name {
-            // Clear the cache if the model is different
-            await modelContainer.clearCache()
-            print("[MLXService] Model changed, cleared KV Cache.")
-            lastUsedModelId = model.name
-        }
-
         // Map app-specific Message type to Chat.Message for model input
         let chat = messages.map { message in
             let role: Chat.Message.Role =
@@ -123,48 +114,36 @@ class MLXService {
 
         // Generate response using the model
         return try await modelContainer.perform { (context: ModelContext) in
-            // --- Prompt Caching Logic ---
-            // Only prefill if there are more than just the initial system message and the current turn
-            // (user + empty assistant = 2). Assumes first message is system.
-            if messages.count > 3 {
-                // Prepare history: all messages except the last (empty assistant) one.
-                // The `processor.prepare` below handles the *full* input including the latest user message.
-                let historyMessages = Array(chat.dropLast())  // Drop the empty assistant message
-                let historyUserInput = UserInput(chat: historyMessages)
-
-                // Try to get history tokens. Need tokenizer from context.
-                do {
-                    // Attempt to use the processor first, as it might handle VLM details.
-                    // Note: This runs prepare twice (once for history, once for full), which is suboptimal.
-                    // A better approach might involve direct tokenizer access or a dedicated history tokenization method.
-                    let historyLmInput = try await context.processor.prepare(input: historyUserInput)
-                    let historyTokens = historyLmInput.text.tokens.asArray(Int.self)
-
-                    // Check if current cache offset matches history length
-                    let currentCacheOffset = context.kvCache?.first?.offset ?? 0  // Assuming single cache for now
-
-                    if currentCacheOffset != historyTokens.count {
-                        print("[MLXService] Prefilling cache for \(historyTokens.count) history tokens. Current cache offset: \(currentCacheOffset)")
-                        await modelContainer.prefill(promptTokens: historyTokens)
-                    } else {
-                        print("[MLXService] Cache already matches history length (\(currentCacheOffset) tokens). Skipping prefill.")
-                    }
-                } catch {
-                    // Fallback or error handling if history tokenization fails
-                    print("[MLXService] Warning: Could not prepare history tokens for prefill. Error: \(error). Proceeding without prefill.")
-                    // Ensure cache is clear if we couldn't reliably check/prefill
-                    await modelContainer.clearCache()
-                }
-            }
-            // --- End Caching Logic ---
 
-            // Prepare the *full* input (including the latest user message)
-            let lmInput = try await context.processor.prepare(input: userInput)
-            // Set temperature for response randomness (0.7 provides good balance)
+            let fullPrompt = try await context.processor.prepare(input: userInput)
+
             let parameters = GenerateParameters(temperature: 0.7)
 
+            let cache: PromptCache
+            if let existingCache = self.promptCache[model.name] {
+                cache = existingCache
+            } else {
+                // Create cache if it doesn't exist yet
+                cache = PromptCache(cache: context.model.newCache(parameters: parameters))
+                promptCache[model.name] = cache
+            }
+
+            let lmInput: LMInput
+
+            // Remove prefix from prompt that is already in cache
+            if let suffix = await cache.getUncachedSuffix(prompt: fullPrompt.text.tokens) {
+                lmInput = LMInput(text: LMInput.Text(tokens: suffix))
+            } else {
+                // If suffix is nil, the cache is inconsistent with the new prompt
+                // and the cache doesn't support trimming so create a new one here.
+                self.promptCache[model.name] = PromptCache(cache: context.model.newCache(parameters: parameters))
+                lmInput = fullPrompt
+            }
+
+            // TODO: cache.perform ...
+            // TODO: The generated tokens should be added to the prompt cache but not possible with AsyncStream
             return try MLXLMCommon.generate(
-                input: lmInput, parameters: parameters, context: context)
+                input: lmInput, parameters: parameters, context: context, cache: await cache.cache)
         }
     }
 }
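Note: the hunks above assume a PromptCache type, introduced elsewhere in the same change, that pairs the model's KV cache with the tokens already fed through it. The following is only an illustrative sketch of what such a helper could look like given how the diff uses it; the actor layout, the cachedTokens bookkeeping, and the body of getUncachedSuffix are assumptions, not the PR's actual implementation.

// Illustrative sketch only -- the real PromptCache in this change may differ.
import MLX
import MLXLMCommon

actor PromptCache {
    /// The model's key/value cache, reused across turns.
    let cache: [KVCache]

    /// Tokens that have already been fed through `cache` (assumed bookkeeping).
    private var cachedTokens: [Int] = []

    init(cache: [KVCache]) {
        self.cache = cache
    }

    /// Returns the part of `prompt` not yet represented in the cache, or nil
    /// when the cached prefix no longer matches the new prompt (the caller
    /// then discards this cache and creates a fresh one).
    func getUncachedSuffix(prompt: MLXArray) -> MLXArray? {
        let promptTokens = prompt.asArray(Int.self)
        let cachedCount = cachedTokens.count

        // The new prompt must strictly extend what is already cached.
        guard promptTokens.count > cachedCount,
            Array(promptTokens.prefix(cachedCount)) == cachedTokens
        else { return nil }

        // Record the full prompt as cached and return only the new tokens.
        cachedTokens = promptTokens
        return MLXArray(promptTokens[cachedCount...].map { Int32($0) })
    }
}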