@@ -85,20 +85,58 @@ public actor ModelContainer {
         context.kvCache = nil
     }

-    /// Prefills the Key/Value cache with the given prompt tokens.
+    /// Prefills the Key/Value cache by running the model's forward pass
+    /// on the provided tokens.
     ///
-    /// - Parameter promptTokens: The token IDs to prefill the cache with.
-    /// - Note: This requires specific model support to run the forward pass
-    ///   without full generation and extract the cache state.
-    ///   Implementation is pending further model integration.
-    public func prefill(promptTokens: [Int]) async throws {
-        // TODO: Implement prefill logic.
-        // This will involve:
-        // 1. Ensuring the model supports cache extraction.
-        // 2. Running a partial forward pass with promptTokens.
-        // 3. Storing the resulting KVCache in context.kvCache.
-        print("Prefill functionality not yet implemented.")
-        // For now, just clear the cache to avoid using a potentially stale one.
-        clearCache()
+    /// This populates the internal cache state, allowing subsequent `generate` calls
+    /// to start generation immediately after the prefilled tokens without reprocessing them.
+    ///
+    /// - Parameters:
+    ///   - promptTokens: The token IDs to prefill the cache with.
+    ///   - chunkSize: The number of tokens to process in each model evaluation step. Defaults to 512.
+    public func prefill(promptTokens: [Int], chunkSize: Int = 512) {
+        // Ensure we have tokens to process
+        guard !promptTokens.isEmpty else {
+            // If the prompt is empty, ensure the cache is cleared
+            clearCache()
+            return
+        }
+
+        // Create a new cache instance
+        let newCache = context.model.newCache(parameters: nil)
+
+        // Convert tokens to MLXArray
+        let tokensToProcess = MLXArray(promptTokens)
+
+        // Process tokens in chunks
+        var currentOffset = 0
+        var state: LMOutput.State? = nil // Manage state if the model uses it
+
+        while currentOffset < tokensToProcess.size {
+            let endOffset = min(currentOffset + chunkSize, tokensToProcess.size)
+            let chunk = tokensToProcess[currentOffset ..< endOffset]
+
+            // Create LMInput.Text for the chunk
+            // Adding a new axis as models typically expect a batch dimension
+            let inputText = LMInput.Text(tokens: chunk[.newAxis])
+
+            // Run the model's forward pass for the chunk
+            // This implicitly updates the newCache passed to it
+            let result = context.model(inputText, cache: newCache, state: state)
+
+            // Update state if provided by the model
+            state = result.state
+
+            // Move to the next chunk
+            currentOffset = endOffset
+        }
+
+        // Ensure all computations related to cache population are completed
+        eval(newCache)
+
+        // Store the populated cache in the context
+        context.kvCache = newCache
     }
+
+    // TODO: Add trimCache(to offset: Int) method
 }
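
For reference, a hypothetical call site might look like the sketch below. The `container` and `tokenizer` names are placeholders and are not part of this diff; only `prefill(promptTokens:chunkSize:)` and `context.kvCache` come from the change above. Because `ModelContainer` is an actor, the call is awaited from outside the actor even though the method itself is synchronous.

    // Hypothetical usage sketch; `container` and `tokenizer` are assumed to exist.
    let tokens = tokenizer.encode(text: "Summarize the following document: ...")

    // Populate the KV cache once for the shared prompt prefix.
    await container.prefill(promptTokens: tokens, chunkSize: 256)

    // Subsequent generate calls can start from the cached state instead of
    // re-running the forward pass over the prompt tokens.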