     Qwen3VLCausalLMOutputWithPast,
     Qwen3VLForConditionalGeneration,
     Qwen3VLModel,
+    Qwen3VLModelOutputWithPast,
 )
 from transformers.models.qwen3_vl.processing_qwen3_vl import Qwen3VLProcessor

@@ -203,9 +204,7 @@ def _get_input_embeds(
         deepstack_visual_embeds = deepstack_video_embeds

     if pixel_values is None and pixel_values_videos is None:
-        config = model.config.vision_config
-        patch_dim = config.in_channels * config.temporal_patch_size * config.patch_size**2
-        pixel_values = torch.zeros((16, patch_dim), dtype=inputs_embeds.dtype, device=inputs_embeds.device)
+        pixel_values = torch.zeros((16, 1176), dtype=inputs_embeds.dtype, device=inputs_embeds.device)
         image_grid_thw = torch.tensor([[1, 4, 4]], dtype=torch.long, device=inputs_embeds.device)
         image_embeds, _ = model.visual(pixel_values, grid_thw=image_grid_thw)
         inputs_embeds += 0.0 * image_embeds.mean()
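For context: when a batch has neither images nor videos, the hunk above runs the vision tower on a dummy all-zero input and folds the result into the text embeddings with a 0.0 coefficient, so the visual parameters still appear in the autograd graph (the usual workaround for unused-parameter errors under DDP/DeepSpeed on text-only batches). The removed lines suggest 1176 is the flattened patch dimension (in_channels * temporal_patch_size * patch_size**2) for the default vision config. A minimal, self-contained sketch of the pattern, using a toy stand-in for model.visual; vision_tower, PATCH_DIM, and the shapes below are illustrative only, not part of the patch:

import torch
import torch.nn as nn

PATCH_DIM = 1176  # flattened patch dimension assumed by the dummy input (see removed lines above)
vision_tower = nn.Linear(PATCH_DIM, 64)  # toy stand-in for model.visual

def keep_vision_in_graph(inputs_embeds: torch.Tensor) -> torch.Tensor:
    # Run the vision tower on zeros and add its mean with a 0.0 weight: the
    # embeddings are numerically unchanged, but autograd now records a path
    # through every vision parameter, so distributed training does not flag
    # them as unused on text-only batches.
    dummy = torch.zeros((16, PATCH_DIM), dtype=inputs_embeds.dtype, device=inputs_embeds.device)
    image_embeds = vision_tower(dummy)
    return inputs_embeds + 0.0 * image_embeds.mean()

text_embeds = torch.randn(2, 8, 64, requires_grad=True)
keep_vision_in_graph(text_embeds).sum().backward()
print(vision_tower.weight.grad is not None)  # True: the tower received (zero-valued) gradients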
@@ -241,7 +240,8 @@ def qwen3_vl_base_forward(
         self, input_ids, attention_mask, pixel_values, pixel_values_videos, image_grid_thw, video_grid_thw
     )
     kwargs.update(input_kwargs)  # avoid lora module to have multiple keyword arguments
-    return self.language_model(input_ids=None, **kwargs)
+    outputs = self.language_model(input_ids=None, **kwargs)
+    return Qwen3VLModelOutputWithPast(last_hidden_state=outputs.last_hidden_state)


 def qwen3_vl_model_forward(
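The second hunk changes the return type of the patched base forward: instead of passing the decoder's output straight through, it re-wraps the last hidden state in Qwen3VLModelOutputWithPast so that callers expecting the multimodal output class keep working. A rough sketch of that re-wrapping pattern with generic stand-in dataclasses (DecoderOutput and MultimodalOutput are illustrative names, not the real transformers classes):

from dataclasses import dataclass
import torch

@dataclass
class DecoderOutput:      # stand-in for what self.language_model returns
    last_hidden_state: torch.Tensor

@dataclass
class MultimodalOutput:   # stand-in for Qwen3VLModelOutputWithPast
    last_hidden_state: torch.Tensor

def base_forward(decoder_out: DecoderOutput) -> MultimodalOutput:
    # Re-wrap so downstream code typed against the multimodal output class
    # still finds .last_hidden_state where it expects it.
    return MultimodalOutput(last_hidden_state=decoder_out.last_hidden_state)

hidden = torch.randn(1, 8, 64)
wrapped = base_forward(DecoderOutput(last_hidden_state=hidden))
print(wrapped.last_hidden_state.shape)  # torch.Size([1, 8, 64])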