@@ -693,18 +693,22 @@ public class Qwen2VLProcessor: UserInputProcessor {
693
693
if let role = messages [ 0 ] [ " role " ] as? String , role != " system " {
694
694
messages. insert ( [ " role " : " system " , " content " : " You are a helpful assistant. " ] , at: 0 )
695
695
}
696
- // // Add image markers to last message if needed
697
- // if let imageTHW {
698
- // let lastIndex = messages.count - 1
699
- // var content = messages[lastIndex]["content"] ?? ""
700
- // let mergeLength = config.mergeSize * config.mergeSize
701
- // for thw in imageTHW {
702
- // content += "<|vision_start|>"
703
- // content += Array(repeating: "<|image_pad|>", count: thw.product / mergeLength).joined()
704
- // content += "<|vision_end|>"
705
- // }
706
- // messages[lastIndex]["content"] = content
707
- // }
696
+
697
+ // // Add image markers to last message if needed
698
+ // if let imageTHW {
699
+ // let lastIndex = messages.count - 1
700
+ // var content = messages[lastIndex]["content"] as? String ?? ""
701
+ // let mergeLength = config.mergeSize * config.mergeSize
702
+ // for thw in imageTHW {
703
+ // content += "<|vision_start|>"
704
+ // content += Array(repeating: "<|image_pad|>", count: thw.product / mergeLength).joined()
705
+ // content += "<|vision_end|>"
706
+ // }
707
+ // messages[lastIndex]["content"] = content
708
+ // }
709
+
710
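+ // TODO: Instead of appending markers as in the commented-out code above, replace the single `<|image_pad|>` placeholder with repeated padding: for each `thw` in `imageTHW`, repeat it `thw.product / mergeLength` times, where `mergeLength = config.mergeSize * config.mergeSize`.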
711
+
708
712
return messages
709
713
}
710
714
@@ -730,6 +734,11 @@ public class Qwen2VLProcessor: UserInputProcessor {
730
734
// Prepare messages with image markers
731
735
let messages = prepareMessages ( input. prompt. asMessages ( ) , imageTHW: image. imageGridThw)
732
736
let promptTokens = try tokenizer. applyChatTemplate ( messages: messages)
737
+
738
+ // TODO: Debugging aid only: decodes the prompt tokens back to text and prints them. Remove later.
739
+ let promptTokensDecoded = try tokenizer. decode ( tokens: promptTokens)
740
+ print ( promptTokensDecoded)
741
+
733
742
let promptArray = MLXArray ( promptTokens) . expandedDimensions ( axis: 0 )
734
743
let mask = ones ( like: promptArray) . asType ( . int8)
735
744
return LMInput ( text: . init( tokens: promptArray, mask: mask) , image: image)
0 commit comments