Skip to content

Commit b97db61

Browse files
authored
Refactor VLMEval to use structured messages (#300)
1 parent 38c3715 commit b97db61

File tree

1 file changed

+16
-52
lines changed

1 file changed

+16
-52
lines changed

Applications/VLMEval/ContentView.swift

Lines changed: 16 additions & 52 deletions
Original file line number | Diff line number | Diff line change
@@ -395,58 +395,22 @@ class VLMEvaluator {
395395
MLXRandom.seed(UInt64(Date.timeIntervalSinceReferenceDate * 1000))
396396

397397
try await modelContainer.perform { (context: ModelContext) -> Void in
398-
399-
let images: [UserInput.Image] =
400-
if let image {
401-
[UserInput.Image.ciImage(image)]
402-
} else {
403-
[]
404-
}
405-
let videos: [UserInput.Video] =
406-
if let videoURL {
407-
[.url(videoURL)]
408-
} else {
409-
[]
410-
}
411-
let messages: [[String: Any]] =
412-
if !images.isEmpty || !videos.isEmpty {
413-
[
414-
[
415-
"role": "system",
416-
"content": [
417-
[
418-
"type": "text",
419-
"text": videoURL != nil
420-
? videoSystemPrompt : imageSystemPrompt,
421-
]
422-
],
423-
],
424-
[
425-
"role": "user",
426-
"content": [
427-
[
428-
"type": "text",
429-
"text": prompt,
430-
]
431-
]
432-
// Messages format for Qwen 2 VL, Qwen 2.5 VL. May need to be adapted for other models.
433-
+ images.map { _ in
434-
["type": "image"]
435-
}
436-
+ videos.map { _ in
437-
["type": "video"]
438-
},
439-
],
440-
]
441-
} else {
442-
[
443-
[
444-
"role": "user",
445-
"content": prompt,
446-
]
447-
]
448-
}
449-
var userInput = UserInput(messages: messages, images: images, videos: videos)
398+
let images: [UserInput.Image] = if let image { [.ciImage(image)] } else { [] }
399+
let videos: [UserInput.Video] = if let videoURL { [.url(videoURL)] } else { [] }
400+
401+
let systemPrompt =
402+
if !videos.isEmpty {
403+
videoSystemPrompt
404+
} else if !images.isEmpty {
405+
imageSystemPrompt
406+
} else { "You are a helpful assistant." }
407+
408+
let chat: [Chat.Message] = [
409+
.system(systemPrompt),
410+
.user(prompt, images: images, videos: videos),
411+
]
412+
413+
var userInput = UserInput(chat: chat)
450414
userInput.processing.resize = .init(width: 448, height: 448)
451415

452416
let lmInput = try await context.processor.prepare(input: userInput)

0 commit comments

Comments
 (0)