@@ -32,10 +32,10 @@ class MLXService {
    ]

    /// Cache to store loaded model containers to avoid reloading.
-    private var modelCache: [String: ModelContainer] = [:]
+    private let modelCache = NSCache<NSString, ModelContainer>()

    /// Stores a prompt cache for each loaded model
-    private var promptCache: [String: PromptCache] = [:]
+    private let promptCache = NSCache<NSString, PromptCache>()

    /// Tracks the current model download progress.
    /// Access this property to monitor model download status.
@@ -51,9 +51,10 @@ class MLXService {
        MLX.GPU.set(cacheLimit: 20 * 1024 * 1024)

        // Return cached model if available to avoid reloading
-        if let container = modelCache[model.name] {
+        if let container = modelCache.object(forKey: model.name as NSString) {
            return container
        } else {
+            print("Model not loaded \(model.name), loading model...")
            // Select appropriate factory based on model type
            let factory: ModelFactory =
                switch model.type {
@@ -71,9 +72,13 @@ class MLXService {
                    self.modelDownloadProgress = progress
                }
            }
-
+
+            // Clear out the promptCache
+            promptCache.removeObject(forKey: model.name as NSString)
+
            // Cache the loaded model for future use
-            modelCache[model.name] = container
+            modelCache.setObject(container, forKey: model.name as NSString)
+
            return container
        }
    }
@@ -118,32 +123,41 @@ class MLXService {

            let parameters = GenerateParameters(temperature: 0.7)

-            // Get the prompt cache
-            let cache: PromptCache
-            if let existingCache = self.promptCache[model.name] {
-                cache = existingCache
-            } else {
-                // Create cache if it doesn't exist yet
-                cache = PromptCache(cache: context.model.newCache(parameters: parameters))
-                promptCache[model.name] = cache
-            }
-
-            let lmInput: LMInput
-
-            /// Remove prefix from prompt that is already in cache
-            if let suffix = cache.getUncachedSuffix(prompt: fullPrompt.text.tokens) {
-                lmInput = LMInput(text: LMInput.Text(tokens: suffix))
-            } else {
-                // If suffix is nil, the cache is inconsistent with the new prompt
-                // and the cache doesn't support trimming so create a new one here.
-                self.promptCache[model.name] = PromptCache(cache: context.model.newCache(parameters: parameters))
-                lmInput = fullPrompt
-            }
+            // TODO: Prompt cache access isn't isolated
+            // Get the prompt cache and adjust new prompt to remove
+            // prefix already in cache, trim cache if cache is
+            // inconsistent with new prompt.
+            let (cache, lmInput) = getPromptCache(fullPrompt: fullPrompt, parameters: parameters, context: context, modelName: model.name)

-            // TODO: cache.perform ...
            // TODO: The generated tokens should be added to the prompt cache but not possible with AsyncStream
            return try MLXLMCommon.generate(
                input: lmInput, parameters: parameters, context: context, cache: cache.cache)
        }
    }
+
+    func getPromptCache(fullPrompt: LMInput, parameters: GenerateParameters, context: ModelContext, modelName: String) -> (PromptCache, LMInput) {
+        let cache: PromptCache
+        if let existingCache = promptCache.object(forKey: modelName as NSString) {
+            cache = existingCache
+        } else {
+            // Create cache if it doesn't exist yet
+            cache = PromptCache(cache: context.model.newCache(parameters: parameters))
+            self.promptCache.setObject(cache, forKey: modelName as NSString)
+        }
+
+        let lmInput: LMInput
+
+        /// Remove prefix from prompt that is already in cache
+        if let suffix = cache.getUncachedSuffix(prompt: fullPrompt.text.tokens) {
+            lmInput = LMInput(text: LMInput.Text(tokens: suffix))
+        } else {
+            // If suffix is nil, the cache is inconsistent with the new prompt
+            // and the cache doesn't support trimming so create a new one here.
+            let newCache = PromptCache(cache: context.model.newCache(parameters: parameters))
+            self.promptCache.setObject(newCache, forKey: modelName as NSString)
+            lmInput = fullPrompt
+        }
+
+        return (cache, lmInput)
+    }
}
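
The dictionary-to-NSCache change above follows a standard Foundation pattern: NSCache only accepts reference types for keys and values, so the String model name is bridged to NSString on every lookup, individual reads and writes are thread-safe without extra locking, and entries may be evicted under memory pressure, so callers must be prepared to rebuild them. A compound check-then-create sequence like the one in getPromptCache is still not atomic, however. Below is a minimal, self-contained sketch of the lookup-or-create pattern, using a hypothetical Box class as a stand-in for ModelContainer or PromptCache; none of these names come from the commit itself.

import Foundation

// Hypothetical payload type; NSCache values must be reference types.
final class Box {
    let value: Int
    init(_ value: Int) { self.value = value }
}

// NSCache keys must also be reference types, hence NSString instead of String.
let cache = NSCache<NSString, Box>()

func cachedBox(for name: String) -> Box {
    // Return the cached object if it is still resident.
    if let existing = cache.object(forKey: name as NSString) {
        return existing
    }
    // Otherwise build it (expensive work would go here) and store it.
    // NSCache may evict this entry under memory pressure.
    let fresh = Box(42)
    cache.setObject(fresh, forKey: name as NSString)
    return fresh
}

print(cachedBox(for: "example-model").value)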