Skip to content

Commit c1deeb4

Browse files
committed
Debug and add to-do for next steps
1 parent 96cdc14 commit c1deeb4

File tree

1 file changed

+21
-12
lines changed

1 file changed

+21
-12
lines changed

Libraries/MLXVLM/Models/Qwen2VL.swift

Lines changed: 21 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -693,18 +693,22 @@ public class Qwen2VLProcessor: UserInputProcessor {
693693
if let role = messages[0]["role"] as? String, role != "system" {
694694
messages.insert(["role": "system", "content": "You are a helpful assistant."], at: 0)
695695
}
696-
// // Add image markers to last message if needed
697-
// if let imageTHW {
698-
// let lastIndex = messages.count - 1
699-
// var content = messages[lastIndex]["content"] ?? ""
700-
// let mergeLength = config.mergeSize * config.mergeSize
701-
// for thw in imageTHW {
702-
// content += "<|vision_start|>"
703-
// content += Array(repeating: "<|image_pad|>", count: thw.product / mergeLength).joined()
704-
// content += "<|vision_end|>"
705-
// }
706-
// messages[lastIndex]["content"] = content
707-
// }
696+
697+
// // Add image markers to last message if needed
698+
// if let imageTHW {
699+
// let lastIndex = messages.count - 1
700+
// var content = messages[lastIndex]["content"] as? String ?? ""
701+
// let mergeLength = config.mergeSize * config.mergeSize
702+
// for thw in imageTHW {
703+
// content += "<|vision_start|>"
704+
// content += Array(repeating: "<|image_pad|>", count: thw.product / mergeLength).joined()
705+
// content += "<|vision_end|>"
706+
// }
707+
// messages[lastIndex]["content"] = content
708+
// }
709+
710+
// TODO: Instead of appending the markers as above, expand the single `<|image_pad|>` already present in the prompt into repeated `<|image_pad|>` tokens, using the same count as above (`thw.product / mergeLength`) for each image.
711+
708712
return messages
709713
}
710714

@@ -730,6 +734,11 @@ public class Qwen2VLProcessor: UserInputProcessor {
730734
// Prepare messages with image markers
731735
let messages = prepareMessages(input.prompt.asMessages(), imageTHW: image.imageGridThw)
732736
let promptTokens = try tokenizer.applyChatTemplate(messages: messages)
737+
738+
// TODO: For debugging. Remove later.
739+
let promptTokensDecoded = try tokenizer.decode(tokens: promptTokens)
740+
print(promptTokensDecoded)
741+
733742
let promptArray = MLXArray(promptTokens).expandedDimensions(axis: 0)
734743
let mask = ones(like: promptArray).asType(.int8)
735744
return LMInput(text: .init(tokens: promptArray, mask: mask), image: image)

0 commit comments

Comments
 (0)