Skip to content

Commit a675b2a

Browse files
committed
Added actor PromptCache
1 parent 0400695 commit a675b2a

File tree

3 files changed

+21
-67
lines changed

3 files changed

+21
-67
lines changed

Libraries/MLXLMCommon/Evaluate.swift

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -526,17 +526,17 @@ public func generate(
526526
/// ``generate(input:context:iterator:didGenerate:)``
527527
///
528528
/// - Parameters:
529-
/// - input: language model input
529+
/// - input: prepared language model input
530530
/// - parameters: parameters controlling the token generation
531531
/// - context: model context (model and tokenizer)
532532
/// - didGenerate: token visitor that can output tokens as they are generated and indicate early stop
533533
/// - Returns: the generated output
534534
public func generate(
535-
input: LMInput, parameters: GenerateParameters, context: ModelContext,
535+
input: LMInput, parameters: GenerateParameters, context: ModelContext, cache: [KVCache]? = nil,
536536
didGenerate: ([Int]) -> GenerateDisposition
537537
) throws -> GenerateResult {
538538
let iterator = try TokenIterator(
539-
input: input, model: context.model, cache: context.kvCache, parameters: parameters)
539+
input: input, model: context.model, cache: cache, parameters: parameters)
540540
return generate(
541541
input: input, context: context, iterator: iterator, didGenerate: didGenerate)
542542
}
@@ -629,11 +629,11 @@ public func generate(
629629
/// - didGenerate: token visitor that can output tokens as they are generated and indicate early stop
630630
/// - Returns: Information about the generation
631631
public func generate(
632-
input: LMInput, parameters: GenerateParameters, context: ModelContext,
632+
input: LMInput, parameters: GenerateParameters, context: ModelContext, cache: [KVCache]? = nil,
633633
didGenerate: (Int) -> GenerateDisposition
634634
) throws -> GenerateCompletionInfo {
635635
let iterator = try TokenIterator(
636-
input: input, model: context.model, cache: context.kvCache, parameters: parameters)
636+
input: input, model: context.model, cache: cache, parameters: parameters)
637637
return generate(
638638
input: input, context: context, iterator: iterator, didGenerate: didGenerate)
639639
}
@@ -729,10 +729,10 @@ public func generate(
729729
/// }
730730
/// ```
731731
public func generate(
732-
input: LMInput, parameters: GenerateParameters, context: ModelContext
732+
input: LMInput, parameters: GenerateParameters, context: ModelContext, cache: [KVCache]?
733733
) throws -> AsyncStream<Generation> {
734734
let iterator = try TokenIterator(
735-
input: input, model: context.model, cache: context.kvCache, parameters: parameters)
735+
input: input, model: context.model, cache: cache, parameters: parameters)
736736
return generate(
737737
input: input, context: context, iterator: iterator)
738738
}

Libraries/MLXLMCommon/KVCache.swift

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,10 @@ public protocol KVCache: Evaluatable {
1212
var offset: Int { get }
1313

1414
func update(keys: MLXArray, values: MLXArray) -> (MLXArray, MLXArray)
15+
16+
func isTrimmable() -> Bool
17+
18+
func trim(n: Int) -> Int
1519
}
1620

1721
func createAdditiveCausalMask(n: Int, offset: Int) -> MLXArray {
@@ -96,5 +100,14 @@ public class KVCacheSimple: KVCache, Evaluatable {
96100
self.values![.ellipsis, ..<self.offset, 0...]
97101
)
98102
}
99-
103+
104+
public func isTrimmable() -> Bool {
105+
return true
106+
}
107+
108+
public func trim(n: Int) -> Int {
109+
let toTrim = min(self.offset, n)
110+
self.offset -= toTrim
111+
return toTrim
112+
}
100113
}

Libraries/MLXLMCommon/ModelContainer.swift

Lines changed: 0 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -80,63 +80,4 @@ public actor ModelContainer {
8080
action(&context)
8181
}
8282

83-
/// Clears the Key/Value cache stored within the model context.
84-
public func clearCache() {
85-
context.kvCache = nil
86-
}
87-
88-
/// Prefills the Key/Value cache by running the model's forward pass
89-
/// on the provided tokens.
90-
///
91-
/// This populates the internal cache state, allowing subsequent `generate` calls
92-
/// to start generation immediately after the prefilled tokens without reprocessing them.
93-
///
94-
/// - Parameters:
95-
/// - promptTokens: The token IDs to prefill the cache with.
96-
/// - chunkSize: The number of tokens to process in each model evaluation step. Defaults to 512.
97-
public func prefill(promptTokens: [Int], chunkSize: Int = 512) async {
98-
// Ensure we have tokens to process
99-
guard !promptTokens.isEmpty else {
100-
// If the prompt is empty, ensure the cache is cleared
101-
clearCache()
102-
return
103-
}
104-
105-
// Create a new cache instance
106-
let newCache = context.model.newCache(parameters: nil)
107-
108-
// Convert tokens to MLXArray
109-
let tokensToProcess = MLXArray(promptTokens)
110-
111-
// Process tokens in chunks
112-
var currentOffset = 0
113-
var state: LMOutput.State? = nil // Manage state if the model uses it
114-
115-
while currentOffset < tokensToProcess.size {
116-
let endOffset = min(currentOffset + chunkSize, tokensToProcess.size)
117-
let chunk = tokensToProcess[currentOffset ..< endOffset]
118-
119-
// Create LMInput.Text for the chunk
120-
// Adding a new axis as models typically expect a batch dimension
121-
let inputText = LMInput.Text(tokens: chunk[.newAxis])
122-
123-
// Run the model's forward pass for the chunk
124-
// This implicitly updates the newCache passed to it
125-
let result = context.model(inputText, cache: newCache, state: state)
126-
127-
// Update state if provided by the model
128-
state = result.state
129-
130-
// Move to the next chunk
131-
currentOffset = endOffset
132-
}
133-
134-
// Ensure all computations related to cache population are completed
135-
eval(newCache)
136-
137-
// Store the populated cache in the context
138-
context.kvCache = newCache
139-
}
140-
141-
// TODO: Add trimCache(to offset: Int) method
14283
}

0 commit comments

Comments (0)