
Commit 07867be

Moved PromptCache.swift to MLXChatExample and changed it to an @unchecked Sendable class.
1 parent e4b697c commit 07867be

File tree

4 files changed: +43 additions, -18 deletions


Libraries/MLXLMCommon/PromptCache.swift renamed to Applications/MLXChatExample/Models/PromptCache.swift

Lines changed: 37 additions & 14 deletions
@@ -6,10 +6,18 @@
 //

 import MLX
+import MLXLMCommon

-public actor PromptCache {
-    public let cache: [KVCache]
-    public var tokens: MLXArray
+/// Stores the KV Cache between calls to ``generate`` and maintains
+/// the token ids reflected in the cache.
+///
+/// ``PromptCache`` is ``@unchecked Sendable`` which allows it
+/// to be used within the ``ModelContainer`` context.
+///
+/// TODO: cache isolation
+public class PromptCache: @unchecked Sendable {
+    private(set) var cache: [KVCache]
+    private(set) var tokens: MLXArray

     public init(cache: [KVCache]) {
         print("[PromptCache.init]")
@@ -35,7 +43,7 @@ public actor PromptCache {
     /// - Return suffix of prompt not in cache
     /// - If the cache is not trimmable return nil for the caller
     /// to create a new cache.
-    public func getUncachedSuffix(prompt: MLXArray) async -> MLXArray? {
+    public func getUncachedSuffix(prompt: MLXArray) -> MLXArray? {

         print("[getUncachedSuffix] self.tokens.size = \(self.tokens.size)")

@@ -71,30 +79,45 @@ public actor PromptCache {
         return nil
     }

+    /// - Returns: true if all KV caches are trimmable
     public func isTrimmable() -> Bool {
         return cache.allSatisfy { $0.isTrimmable()}
     }

+    /// Trims all KV caches.
+    /// - Parameters:
+    ///   - n: Amount to trim.
+    /// - Returns: Amount KV Caches were trimmed (may be less than ``n``).
     public func trim(_ n: Int) -> Int {
         if !self.isTrimmable(){
             return 0
         }
         return cache.map { $0.trim(n: n) }.max() ?? 0
     }

+    /// Finds the common prefix between the cached prompt and
+    /// the new prompt.
+    /// - Parameters:
+    ///   - newPromptTokens: Tokens to compare with cached tokens.
+    /// - Returns: Length of the common prefix
     public func commonPrefixLength(newPromptTokens: MLXArray) -> Int {
-        return _commonPrefixLength(self.tokens, newPromptTokens)
+        return MLX_Studio.commonPrefixLength(self.tokens, newPromptTokens)
     }
+}

-    // TODO: Add tests
-    public func _commonPrefixLength(_ array1: MLXArray, _ array2: MLXArray) -> Int {
-        print("Calculating common prefix: array1[\(array1.size)] array2[\(array2.size)]")
-        let minLength = min(array1.size, array2.size)
-        for i in 0..<minLength {
-            if all(array1[i] .!= array2[i]).item(Bool.self) {
-                return i
-            }
+/// Finds the common prefix between ``MLXArray``s.
+/// - Parameters:
+///   - array1: First array
+///   - array2: Second array
+/// - Returns: Length of the common prefix
+public func commonPrefixLength(_ array1: MLXArray, _ array2: MLXArray) -> Int {
+    // TODO: Add test cases
+    print("Calculating common prefix: array1[\(array1.size)] array2[\(array2.size)]")
+    let minLength = min(array1.size, array2.size)
+    for i in 0..<minLength {
+        if all(array1[i] .!= array2[i]).item(Bool.self) {
+            return i
         }
-        return minLength
     }
+    return minLength
 }
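Note: a minimal sketch of what the new top-level commonPrefixLength enables. The token values are invented and MLXArray's array-literal initializer is assumed; in the chat example the ids come from the tokenizer.

import MLX

// Hypothetical token ids: the prompt already reflected in the KV cache,
// and the prompt for the next turn of the conversation.
let cachedTokens: MLXArray = [1, 2, 3, 4, 5]
let nextPrompt: MLXArray = [1, 2, 3, 9, 9, 9]

// Compares element-wise until the first mismatch, so `shared` is 3 here.
let shared = commonPrefixLength(cachedTokens, nextPrompt)
print("shared prefix length: \(shared)")

// PromptCache.getUncachedSuffix(prompt:) builds on this: the KV caches are
// trimmed back to `shared` tokens and only the suffix ([9, 9, 9] here) is
// re-processed by the model.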

Applications/MLXChatExample/Services/MLXService.swift

Lines changed: 3 additions & 3 deletions
@@ -10,7 +10,6 @@ import MLX
 import MLXLLM
 import MLXLMCommon
 import MLXVLM
-import Tokenizers // Needed for applyChatTemplate

 /// A service class that manages machine learning models for text and vision-language tasks.
 /// This class handles model loading, caching, and text generation using various LLM and VLM models.
@@ -119,6 +118,7 @@ class MLXService {

         let parameters = GenerateParameters(temperature: 0.7)

+        // Get the prompt cache
         let cache: PromptCache
         if let existingCache = self.promptCache[model.name] {
             cache = existingCache
@@ -131,7 +131,7 @@
         let lmInput: LMInput

         /// Remove prefix from prompt that is already in cache
-        if let suffix = await cache.getUncachedSuffix(prompt: fullPrompt.text.tokens) {
+        if let suffix = cache.getUncachedSuffix(prompt: fullPrompt.text.tokens) {
             lmInput = LMInput(text: LMInput.Text(tokens: suffix))
         } else {
             // If suffix is nil, the cache is inconsistent with the new prompt
@@ -143,7 +143,7 @@
             // TODO: cache.perform ...
             // TODO: The generated tokens should be added to the prompt cache but not possible with AsyncStream
             return try MLXLMCommon.generate(
-                input: lmInput, parameters: parameters, context: context, cache: await cache.cache)
+                input: lmInput, parameters: parameters, context: context, cache: cache.cache)
         }
     }
 }
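For orientation, here is a hypothetical, self-contained rendering of the flow these changes support in MLXService. The names promptCache, fullPrompt, and context mirror the diff above; the cache-creation call (model.newCache(parameters:)) and the recovery branch for an inconsistent cache are assumptions, since those lines fall outside the visible hunks.

import MLX
import MLXLMCommon

// Hypothetical helper mirroring MLXService.generate's caching path.
func generateWithPromptCache(
    promptCache: inout [String: PromptCache],
    modelName: String,
    context: ModelContext,
    fullPrompt: LMInput
) throws -> AsyncStream<Generation> {
    let parameters = GenerateParameters(temperature: 0.7)

    // Reuse the per-model cache when one exists, otherwise start a fresh one.
    // newCache(parameters:) is assumed to be how the KV caches are produced.
    let cache: PromptCache
    if let existing = promptCache[modelName] {
        cache = existing
    } else {
        cache = PromptCache(cache: context.model.newCache(parameters: parameters))
        promptCache[modelName] = cache
    }

    let lmInput: LMInput
    if let suffix = cache.getUncachedSuffix(prompt: fullPrompt.text.tokens) {
        // Only the tokens not already represented in the KV cache are re-processed.
        lmInput = LMInput(text: LMInput.Text(tokens: suffix))
    } else {
        // Assumed recovery path: the cache diverged from the new prompt, so
        // replace it and process the full prompt.
        let fresh = PromptCache(cache: context.model.newCache(parameters: parameters))
        promptCache[modelName] = fresh
        return try MLXLMCommon.generate(
            input: fullPrompt, parameters: parameters, context: context, cache: fresh.cache)
    }

    return try MLXLMCommon.generate(
        input: lmInput, parameters: parameters, context: context, cache: cache.cache)
}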

Libraries/MLXLMCommon/Evaluate.swift

Lines changed: 3 additions & 0 deletions
@@ -529,6 +529,7 @@ public func generate(
 ///   - input: prepared language model input
 ///   - parameters: parameters controlling the token generation
 ///   - context: model context (model and tokenizer)
+///   - cache: KV cache from previous output
 ///   - didGenerate: token visitor that can output tokens as they are generated and indicate early stop
 /// - Returns: the generated output
 public func generate(
@@ -626,6 +627,7 @@ public func generate(
 ///   - input: prepared language model input
 ///   - parameters: parameters controlling the token generation
 ///   - context: model context (model and tokenizer)
+///   - cache: KV cache from previous output
 ///   - didGenerate: token visitor that can output tokens as they are generated and indicate early stop
 /// - Returns: Information about the generation
 public func generate(
@@ -702,6 +704,7 @@
 ///   - input: The input for the language model.
 ///   - parameters: The configuration options for token generation.
 ///   - context: The model context, including the model itself and associated tokenizer.
+///   - cache: KV cache from previous output
 /// - Returns: An `AsyncStream` that emits `Generation` values, including generated tokens (`.token`)
 ///   and completion information (`.info`).
 /// - Throws: An error if the `TokenIterator` initialization fails due to invalid input or model configuration.
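To place the newly documented cache: parameter, a hedged consumption sketch of the AsyncStream overload; lmInput, parameters, context, and cache are assumed to be set up as in MLXService above, and decoding through the context's tokenizer is illustrative.

// Pass the caches carried by PromptCache so generation continues from the
// cached prefix instead of re-evaluating the whole prompt.
let stream = try MLXLMCommon.generate(
    input: lmInput, parameters: parameters, context: context, cache: cache.cache)

for await generation in stream {
    // .token carries each generated token id; .info carries completion stats.
    if case .token(let token) = generation {
        print(context.tokenizer.decode(tokens: [token]), terminator: "")
    }
}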

Libraries/MLXLMCommon/ModelContainer.swift

Lines changed: 0 additions & 1 deletion
@@ -79,5 +79,4 @@ public actor ModelContainer {
     public func update(_ action: @Sendable (inout ModelContext) -> Void) {
         action(&context)
     }
-
 }

0 commit comments
