Commit cb7d9d4

feat: Implement prompt caching in MLXChatExample
Made prefill async, implemented caching logic in MLXService, and fixed related warnings.
1 parent 1275acc commit cb7d9d4
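In outline, the caching logic added to MLXService boils down to two checks: clear the KV cache when the selected model changes, and prefill it with the tokenized chat history only when the current cache offset does not already cover that history. Below is a condensed sketch for orientation only, not code from the commit; it reuses only the clearCache(), prefill(promptTokens:), and kvCache offset calls that appear in the diff, and assumes the caller has already resolved the container, context, and history tokens the way the real generate method does.

import MLXLMCommon

// Condensed sketch of the caching flow added in MLXService.swift (see the diff below).
// Not part of the commit; the container, context, and tokenized history are assumed
// to have been prepared by the caller.
func reuseOrPrefillCache(
    modelContainer: ModelContainer,
    context: ModelContext,
    historyTokens: [Int],
    modelName: String,
    lastUsedModelId: inout String?
) async {
    // Switching models invalidates any previously built KV cache.
    if lastUsedModelId != modelName {
        await modelContainer.clearCache()
        lastUsedModelId = modelName
    }

    // Prefill only when the cache does not already cover the history tokens.
    let cacheOffset = context.kvCache?.first?.offset ?? 0
    if cacheOffset != historyTokens.count {
        await modelContainer.prefill(promptTokens: historyTokens)
    }
}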

File tree (2 files changed, +50 -2 lines):

Applications/MLXChatExample/Services/MLXService.swift
Libraries/MLXLMCommon/ModelContainer.swift

Applications/MLXChatExample/Services/MLXService.swift

Lines changed: 48 additions & 0 deletions
@@ -10,6 +10,7 @@ import MLX
 import MLXLLM
 import MLXLMCommon
 import MLXVLM
+import Tokenizers // Needed for applyChatTemplate

 /// A service class that manages machine learning models for text and vision-language tasks.
 /// This class handles model loading, caching, and text generation using various LLM and VLM models.

@@ -34,6 +35,9 @@ class MLXService {
     /// Cache to store loaded model containers to avoid reloading.
     private let modelCache = NSCache<NSString, ModelContainer>()

+    /// Tracks the ID of the last model used to detect changes for cache invalidation.
+    private var lastUsedModelId: String?
+
     /// Tracks the current model download progress.
     /// Access this property to monitor model download status.
     @MainActor

@@ -86,6 +90,14 @@ class MLXService {
         // Load or retrieve model from cache
         let modelContainer = try await load(model: model)

+        // Check if the model has changed since last generation
+        if lastUsedModelId != model.name {
+            // Clear the cache if the model is different
+            await modelContainer.clearCache()
+            print("[MLXService] Model changed, cleared KV Cache.")
+            lastUsedModelId = model.name
+        }
+
         // Map app-specific Message type to Chat.Message for model input
         let chat = messages.map { message in
             let role: Chat.Message.Role =

@@ -111,6 +123,42 @@ class MLXService {

         // Generate response using the model
         return try await modelContainer.perform { (context: ModelContext) in
+            // --- Prompt Caching Logic ---
+            // Only prefill if there are more than just the initial system message and the current turn
+            // (user + empty assistant = 2). Assumes first message is system.
+            if messages.count > 3 {
+                // Prepare history: all messages except the last (empty assistant) one.
+                // The `processor.prepare` below handles the *full* input including the latest user message.
+                let historyMessages = Array(chat.dropLast()) // Drop the empty assistant message
+                let historyUserInput = UserInput(chat: historyMessages)
+
+                // Try to get history tokens. Need tokenizer from context.
+                do {
+                    // Attempt to use the processor first, as it might handle VLM details.
+                    // Note: This runs prepare twice (once for history, once for full), which is suboptimal.
+                    // A better approach might involve direct tokenizer access or a dedicated history tokenization method.
+                    let historyLmInput = try await context.processor.prepare(input: historyUserInput)
+                    let historyTokens = historyLmInput.text.tokens.asArray(Int.self)
+
+                    // Check if current cache offset matches history length
+                    let currentCacheOffset = context.kvCache?.first?.offset ?? 0 // Assuming single cache for now
+
+                    if currentCacheOffset != historyTokens.count {
+                        print("[MLXService] Prefilling cache for \(historyTokens.count) history tokens. Current cache offset: \(currentCacheOffset)")
+                        await modelContainer.prefill(promptTokens: historyTokens)
+                    } else {
+                        print("[MLXService] Cache already matches history length (\(currentCacheOffset) tokens). Skipping prefill.")
+                    }
+                } catch {
+                    // Fallback or error handling if history tokenization fails
+                    print("[MLXService] Warning: Could not prepare history tokens for prefill. Error: \(error). Proceeding without prefill.")
+                    // Ensure cache is clear if we couldn't reliably check/prefill
+                    await modelContainer.clearCache()
+                }
+            }
+            // --- End Caching Logic ---
+
+            // Prepare the *full* input (including the latest user message)
             let lmInput = try await context.processor.prepare(input: userInput)
             // Set temperature for response randomness (0.7 provides good balance)
             let parameters = GenerateParameters(temperature: 0.7)
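As the comment in the last hunk notes, tokenizing the history through a second processor.prepare pass is suboptimal. One possible shape of the "direct tokenizer access" alternative it mentions, sketched on the assumption that the tokenizer reachable from the model context is a swift-transformers Tokenizer whose applyChatTemplate(messages:) returns prompt token IDs; this text-only path would not cover VLM image or video history.

import Tokenizers

// Hypothetical alternative to the double processor.prepare call above: tokenize
// the chat history once via the tokenizer's chat template. Assumes a
// swift-transformers Tokenizer exposing applyChatTemplate(messages:) -> [Int];
// history is passed as plain role/content pairs, so VLM attachments are out of scope.
func tokenizeHistory(
    _ history: [(role: String, content: String)],
    with tokenizer: Tokenizer
) throws -> [Int] {
    let messages = history.map { ["role": $0.role, "content": $0.content] }
    return try tokenizer.applyChatTemplate(messages: messages)
}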

Libraries/MLXLMCommon/ModelContainer.swift

Lines changed: 2 additions & 2 deletions
@@ -94,7 +94,7 @@ public actor ModelContainer {
     /// - Parameters:
     ///   - promptTokens: The token IDs to prefill the cache with.
     ///   - chunkSize: The number of tokens to process in each model evaluation step. Defaults to 512.
-    public func prefill(promptTokens: [Int], chunkSize: Int = 512) {
+    public func prefill(promptTokens: [Int], chunkSize: Int = 512) async {
         // Ensure we have tokens to process
         guard !promptTokens.isEmpty else {
             // If the prompt is empty, ensure the cache is cleared

@@ -106,7 +106,7 @@ public actor ModelContainer {
         let newCache = context.model.newCache(parameters: nil)

         // Convert tokens to MLXArray
-        var tokensToProcess = MLXArray(promptTokens)
+        let tokensToProcess = MLXArray(promptTokens)

         // Process tokens in chunks
         var currentOffset = 0
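Since ModelContainer is an actor and prefill is now async, call sites outside the actor have to await it. A minimal usage sketch, not part of the commit; the container and token values are placeholders supplied by the caller.

import MLXLMCommon

// Minimal usage sketch for the now-async prefill. `container` and `historyTokens`
// are hypothetical values; the default chunk size of 512 is spelled out for clarity.
func warmCache(_ container: ModelContainer, with historyTokens: [Int]) async {
    // Suspends until the actor has pushed the history tokens through the model
    // in chunks, rebuilding its KV cache.
    await container.prefill(promptTokens: historyTokens, chunkSize: 512)
}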
