Skip to content

Commit f6037d1

Browse files
wulipc, 松灵, and DarkLight1337 authored
[Bugfix] Fix MRoPE Errors in the Qwen-VL Model When Processing Pure Text (#18526)
Co-authored-by: 松灵 <wpf272043@alibaba-inc.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk>
1 parent fa72f9a commit f6037d1

File tree

1 file changed

+4
-7
lines changed

1 file changed

+4
-7
lines changed

vllm/worker/model_runner.py

Lines changed: 4 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -729,7 +729,10 @@ def _compute_multi_modal_input(self, inter_data: InterDataForSeqGroup,
729729
mm_kwargs, placeholder_maps = MultiModalPlaceholderMap.from_seq_group(
730730
seq_group_metadata,
731731
range(positions[0], positions[0] + len(positions)))
732-
if not mm_kwargs:
732+
733+
# M-RoPE requires mrope_positions even for plain text; return early
734+
# when mm_kwargs is empty only if inter_data.is_prompt is False.
735+
if not mm_kwargs and not inter_data.is_prompt:
733736
return
734737

735738
inter_data.multi_modal_kwargs = mm_kwargs
@@ -741,12 +744,6 @@ def _compute_multi_modal_input(self, inter_data: InterDataForSeqGroup,
741744
video_grid_thw = mm_kwargs.get("video_grid_thw", None)
742745
audio_feature_lengths = mm_kwargs.get("audio_feature_lengths",
743746
None)
744-
assert (
745-
image_grid_thw is not None or video_grid_thw is not None
746-
or audio_feature_lengths is not None), (
747-
"mrope embedding type requires multi-modal input mapper "
748-
"returns 'image_grid_thw' or 'video_grid_thw' or "
749-
"'audio_feature_lengths'.")
750747

751748
second_per_grid_ts = mm_kwargs.get("second_per_grid_ts", None)
752749
use_audio_in_video = mm_kwargs.get("use_audio_in_video", False)

0 commit comments

Comments
 (0)