@@ -686,69 +686,66 @@ public class Qwen2VLProcessor: UserInputProcessor {
686
686
return ( flattenedPatches, . init( gridT, gridH, gridW) )
687
687
}
688
688
689
/// Renders the chat prompt as text, injecting image placeholder tokens.
///
/// A default system message is prepended when the conversation does not already
/// start with one. For every entry in `imageTHW`, a `<|vision_start|>` …
/// `<|vision_end|>` span containing `thw.product / (mergeSize * mergeSize)`
/// `<|image_pad|>` tokens is appended to the content of the last message. The
/// messages are then passed through the tokenizer's chat template and the
/// resulting tokens are decoded back into a string.
///
/// - Parameters:
///   - prompt: The user prompt; converted to messages with `asMessages()`.
///   - imageTHW: Grid dimensions of the processed images, or `nil` when the
///     prompt has no images (no placeholders are added in that case).
/// - Returns: The decoded output of the chat template.
/// - Throws: Any error thrown by `tokenizer.applyChatTemplate(messages:)`.
public func prepare(prompt: UserInput.Prompt, imageTHW: [THW]?) throws -> String {
    var messages = prompt.asMessages()

    // Use `first?` rather than `[0]` so an empty message list does not trap.
    if messages.first?["role"] != "system" {
        messages.insert(["role": "system", "content": "You are a helpful assistant."], at: 0)
    }

    // `messages` is guaranteed non-empty here: either it already was, or the
    // system message was just inserted. NOTE(review): for an empty prompt the
    // image markers end up on the inserted system message — confirm whether an
    // empty prompt should instead be rejected by the caller.
    let lastIndex = messages.count - 1
    let mergeLength = config.mergeSize * config.mergeSize

    // Append one vision span per image to the last message's content.
    var content = messages[lastIndex]["content"] ?? ""
    for thw in imageTHW ?? [] {
        let padCount = thw.product / mergeLength
        content += "<|vision_start|>"
        content += String(repeating: "<|image_pad|>", count: padCount)
        content += "<|vision_end|>"
    }
    messages[lastIndex]["content"] = content

    let tokens = try tokenizer.applyChatTemplate(messages: messages)
    return tokenizer.decode(tokens: tokens)
}
730
713
731
714
/// Converts a `UserInput` into the `LMInput` consumed by the model.
///
/// - Text-only input: the prompt's messages are passed to the tokenizer's chat
///   template unchanged and the resulting tokens are returned.
/// - Input with images: every image is preprocessed individually, the pixel
///   arrays are concatenated, and one `<|vision_start|>` … `<|vision_end|>`
///   span of `<|image_pad|>` tokens per image is appended to the last message
///   before the chat template is applied. In this path a default system
///   message is inserted when the conversation does not start with one. The
///   token array gets a leading axis and is paired with an all-ones `int8` mask.
///
/// - Parameter input: The prompt, images and processing options.
/// - Returns: The prepared model input.
/// - Throws: Any error thrown while converting or preprocessing the images, or
///   by `tokenizer.applyChatTemplate(messages:)`.
public func prepare(input: UserInput) throws -> LMInput {
    // Text-only input: no image markers to inject.
    guard !input.images.isEmpty else {
        let textTokens = try tokenizer.applyChatTemplate(messages: input.prompt.asMessages())
        return LMInput(tokens: MLXArray(textTokens))
    }

    // Preprocess each image on its own; each result is (pixels, grid THW).
    let processed = try input.images.map {
        try preprocess(images: [$0.asCIImage()], processing: input.processing)
    }
    let pixels = concatenated(processed.map { $0.0 })
    let image = LMInput.ProcessedImage(pixels: pixels, imageGridThw: processed.map { $0.1 })

    var messages = input.prompt.asMessages()

    // Use `first?` rather than `[0]` so an empty message list does not trap.
    if messages.first?["role"] != "system" {
        messages.insert(["role": "system", "content": "You are a helpful assistant."], at: 0)
    }

    // `messages` is guaranteed non-empty here (see the insertion above).
    let lastIndex = messages.count - 1
    let mergeLength = config.mergeSize * config.mergeSize

    // Append one vision span per processed image to the last message's content.
    var content = messages[lastIndex]["content"] ?? ""
    for thw in image.imageGridThw ?? [] {
        let padCount = thw.product / mergeLength
        content += "<|vision_start|>"
        content += String(repeating: "<|image_pad|>", count: padCount)
        content += "<|vision_end|>"
    }
    messages[lastIndex]["content"] = content

    // Render through the chat template, then add a leading axis and a full mask.
    let promptTokens = try tokenizer.applyChatTemplate(messages: messages)
    let promptArray = MLXArray(promptTokens).expandedDimensions(axis: 0)
    let mask = ones(like: promptArray).asType(.int8)

    return LMInput(text: .init(tokens: promptArray, mask: mask), image: image)
}
754
751
0 commit comments