     Qwen3VLCausalLMOutputWithPast,
     Qwen3VLForConditionalGeneration,
     Qwen3VLModel,
+    Qwen3VLModelOutputWithPast,
 )
 from transformers.models.qwen3_vl.processing_qwen3_vl import Qwen3VLProcessor

@@ -203,9 +204,7 @@ def _get_input_embeds(
         deepstack_visual_embeds = deepstack_video_embeds

     if pixel_values is None and pixel_values_videos is None:
-        config = model.config.vision_config
-        patch_dim = config.in_channels * config.temporal_patch_size * config.patch_size**2
-        pixel_values = torch.zeros((16, patch_dim), dtype=inputs_embeds.dtype, device=inputs_embeds.device)
+        pixel_values = torch.zeros((16, 1176), dtype=inputs_embeds.dtype, device=inputs_embeds.device)
         image_grid_thw = torch.tensor([[1, 4, 4]], dtype=torch.long, device=inputs_embeds.device)
         image_embeds, _ = model.visual(pixel_values, grid_thw=image_grid_thw)
         inputs_embeds += 0.0 * image_embeds.mean()
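For context: when a batch has neither images nor videos, the hunk above runs the vision tower on a dummy all-zero input and folds the result into the text embeddings with a 0.0 coefficient, so the visual parameters still appear in the autograd graph (the usual workaround for unused-parameter errors under DDP/DeepSpeed on text-only batches). The removed lines suggest 1176 is the flattened patch dimension (in_channels * temporal_patch_size * patch_size**2) for the default vision config. A minimal, self-contained sketch of the pattern, using a toy stand-in for model.visual; vision_tower, PATCH_DIM, and the shapes below are illustrative only, not part of the patch:

import torch
import torch.nn as nn

PATCH_DIM = 1176  # flattened patch dimension assumed by the dummy input (see removed lines above)
vision_tower = nn.Linear(PATCH_DIM, 64)  # toy stand-in for model.visual

def keep_vision_in_graph(inputs_embeds: torch.Tensor) -> torch.Tensor:
    # Run the vision tower on zeros and add its mean with a 0.0 weight: the
    # embeddings are numerically unchanged, but autograd now records a path
    # through every vision parameter, so distributed training does not flag
    # them as unused on text-only batches.
    dummy = torch.zeros((16, PATCH_DIM), dtype=inputs_embeds.dtype, device=inputs_embeds.device)
    image_embeds = vision_tower(dummy)
    return inputs_embeds + 0.0 * image_embeds.mean()

text_embeds = torch.randn(2, 8, 64, requires_grad=True)
keep_vision_in_graph(text_embeds).sum().backward()
print(vision_tower.weight.grad is not None)  # True: the tower received (zero-valued) gradients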
@@ -241,7 +240,8 @@ def qwen3_vl_base_forward(
         self, input_ids, attention_mask, pixel_values, pixel_values_videos, image_grid_thw, video_grid_thw
     )
     kwargs.update(input_kwargs)  # avoid lora module to have multiple keyword arguments
-    return self.language_model(input_ids=None, **kwargs)
+    outputs = self.language_model(input_ids=None, **kwargs)
+    return Qwen3VLModelOutputWithPast(last_hidden_state=outputs.last_hidden_state)


 def qwen3_vl_model_forward(
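The second hunk changes the return type of the patched base forward: instead of passing the decoder's output straight through, it re-wraps the last hidden state in Qwen3VLModelOutputWithPast so that callers expecting the multimodal output class keep working. A rough sketch of that re-wrapping pattern with generic stand-in dataclasses (DecoderOutput and MultimodalOutput are illustrative names, not the real transformers classes):

from dataclasses import dataclass
import torch

@dataclass
class DecoderOutput:      # stand-in for what self.language_model returns
    last_hidden_state: torch.Tensor

@dataclass
class MultimodalOutput:   # stand-in for Qwen3VLModelOutputWithPast
    last_hidden_state: torch.Tensor

def base_forward(decoder_out: DecoderOutput) -> MultimodalOutput:
    # Re-wrap so downstream code typed against the multimodal output class
    # still finds .last_hidden_state where it expects it.
    return MultimodalOutput(last_hidden_state=decoder_out.last_hidden_state)

hidden = torch.randn(1, 8, 64)
wrapped = base_forward(DecoderOutput(last_hidden_state=hidden))
print(wrapped.last_hidden_state.shape)  # torch.Size([1, 8, 64])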