Decode token in generate() (#260)

ronaldmannak · web-flow · commit e1d3222a0f91 · 2025-04-08T12:35:50.000-07:00
* Decode token in generate()
* Use NaiveStreamingDetokenizer
diff --git a/Libraries/MLXLMCommon/Evaluate.swift b/Libraries/MLXLMCommon/Evaluate.swift
@@ -723,12 +723,13 @@ public func generate(
             var promptTime: TimeInterval = 0
 
             let additionalEOSTokenIds = Set(
-                (context.configuration.extraEOSTokens ?? [])
+                context.configuration.extraEOSTokens
                     .compactMap {
                         context.tokenizer.convertTokenToId($0)
                     })
 
             var tokenCount = 0
+            var detokenizer = NaiveStreamingDetokenizer(tokenizer: context.tokenizer)
 
             for token in iterator {
 
@@ -748,8 +749,11 @@ public func generate(
                     break
                 }
 
-                tokenCount += 1
-                continuation.yield(.token(token))
+                detokenizer.append(token: token)
+                if let chunk = detokenizer.next() {
+                    tokenCount += 1
+                    continuation.yield(.chunk(chunk))
+                }
             }
 
             let now = Date.timeIntervalSinceReferenceDate
@@ -819,11 +823,11 @@ public struct GenerateCompletionInfo: Sendable {
 /// Represents the different stages or outputs of the token generation process.
 ///
 /// This enum distinguishes between the following:
-/// - `.token`: An individual token generated by the language model.
+/// - `.chunk`: A decoded string from one or more tokens generated by the language model.
 /// - `.info`: Metadata and performance statistics about the generation process.
 public enum Generation {
-    /// A generated token represented as an integer.
+    /// A decoded text chunk produced from one or more generated tokens.
-    case token(Int)
+    case chunk(String)
     /// Completion information summarizing token counts and performance metrics.
     case info(GenerateCompletionInfo)
 }