Skip to content

Commit 0bd8e05

Browse files
committed
fix
1 parent 5ddc3b3 commit 0bd8e05

File tree

2 files changed: +7 additions, -5 deletions (lines changed)

verl/models/transformers/qwen2_vl.py

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -22,6 +22,7 @@
 22  22    Qwen2VLCausalLMOutputWithPast,
 23  23    Qwen2VLForConditionalGeneration,
 24  24    Qwen2VLModel,
     25  + Qwen2VLModelOutputWithPast,
 25  26    )
 26  27    from transformers.models.qwen2_vl.processing_qwen2_vl import Qwen2VLProcessor
 27  28

@@ -212,7 +213,8 @@ def qwen2_vl_base_forward(
212 213    self, input_ids, attention_mask, pixel_values, pixel_values_videos, image_grid_thw, video_grid_thw
213 214    )
214 215    kwargs.update(input_kwargs) # avoid lora module to have multiple keyword arguments
215      - return self.language_model(input_ids=None, **kwargs)
    216  + outputs = self.language_model(input_ids=None, **kwargs)
    217  + return Qwen2VLModelOutputWithPast(last_hidden_state=outputs.last_hidden_state)
216 218
217 219
218 220    def qwen2_vl_model_forward(

verl/models/transformers/qwen3_vl.py

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -22,6 +22,7 @@
 22  22    Qwen3VLCausalLMOutputWithPast,
 23  23    Qwen3VLForConditionalGeneration,
 24  24    Qwen3VLModel,
     25  + Qwen3VLModelOutputWithPast,
 25  26    )
 26  27    from transformers.models.qwen3_vl.processing_qwen3_vl import Qwen3VLProcessor
 27  28

@@ -203,9 +204,7 @@ def _get_input_embeds(
203 204    deepstack_visual_embeds = deepstack_video_embeds
204 205
205 206    if pixel_values is None and pixel_values_videos is None:
206      - config = model.config.vision_config
207      - patch_dim = config.in_channels * config.temporal_patch_size * config.patch_size**2
208      - pixel_values = torch.zeros((16, patch_dim), dtype=inputs_embeds.dtype, device=inputs_embeds.device)
    207  + pixel_values = torch.zeros((16, 1176), dtype=inputs_embeds.dtype, device=inputs_embeds.device)
209 208    image_grid_thw = torch.tensor([[1, 4, 4]], dtype=torch.long, device=inputs_embeds.device)
210 209    image_embeds, _ = model.visual(pixel_values, grid_thw=image_grid_thw)
211 210    inputs_embeds += 0.0 * image_embeds.mean()
@@ -241,7 +240,8 @@ def qwen3_vl_base_forward(
241 240    self, input_ids, attention_mask, pixel_values, pixel_values_videos, image_grid_thw, video_grid_thw
242 241    )
243 242    kwargs.update(input_kwargs) # avoid lora module to have multiple keyword arguments
244      - return self.language_model(input_ids=None, **kwargs)
    243  + outputs = self.language_model(input_ids=None, **kwargs)
    244  + return Qwen3VLModelOutputWithPast(last_hidden_state=outputs.last_hidden_state)
245 245
246 246
247 247    def qwen3_vl_model_forward(

0 commit comments

Comments
 (0)