
Commit cb63b0e

Authored by ywang96, co-authored by gemini-code-assist[bot]

[Misc] Make MM embedding merge interface explicit in model runner (vllm-project#21147)

Signed-off-by: Roger Wang <hey@rogerw.me>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Signed-off-by: Himanshu Jaju <hj@mistral.ai>
1 parent c15639a commit cb63b0e

File tree: 2 files changed (+8, −10 lines)

vllm/v1/worker/gpu_model_runner.py

Lines changed: 4 additions & 5 deletions

```diff
@@ -1328,11 +1328,10 @@ def execute_model(
             # embeddings), we always use embeddings (rather than token ids)
             # as input to the multimodal model, even when the input is text.
             input_ids = self.input_ids[:num_scheduled_tokens]
-            if mm_embeds:
-                inputs_embeds = self.model.get_input_embeddings(
-                    input_ids, mm_embeds)
-            else:
-                inputs_embeds = self.model.get_input_embeddings(input_ids)
+            inputs_embeds = self.model.get_input_embeddings(
+                input_ids=input_ids,
+                multimodal_embeddings=mm_embeds or None,
+            )
             # TODO(woosuk): Avoid the copy. Optimize.
             self.inputs_embeds[:num_scheduled_tokens].copy_(inputs_embeds)
             inputs_embeds = self.inputs_embeds[:num_input_tokens]
```
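
Both call sites now route through a single keyword-based call. For illustration, here is a minimal sketch (assumed class name and merge logic, not the actual vLLM model code) of the model-side `get_input_embeddings` interface that such a call targets: text tokens are always embedded, and multimodal embeddings are merged in only when present.

```python
# Minimal sketch (illustrative only) of the keyword-based interface the
# runner now calls unconditionally.
from typing import Optional

import torch
import torch.nn as nn


class ToyMultiModalModel(nn.Module):
    """Toy model exposing the get_input_embeddings keyword interface."""

    def __init__(self, vocab_size: int = 32, hidden_size: int = 16):
        super().__init__()
        self.embed_tokens = nn.Embedding(vocab_size, hidden_size)

    def get_input_embeddings(
        self,
        input_ids: torch.Tensor,
        multimodal_embeddings: Optional[list[torch.Tensor]] = None,
    ) -> torch.Tensor:
        # Always embed the text tokens first.
        inputs_embeds = self.embed_tokens(input_ids)
        # Treat None and an empty list the same way: nothing to merge.
        if multimodal_embeddings:
            # A real model scatters these at placeholder token positions;
            # here we simply replace the leading rows for illustration.
            mm = torch.cat(multimodal_embeddings, dim=0)
            inputs_embeds = torch.cat([mm, inputs_embeds[mm.shape[0]:]], dim=0)
        return inputs_embeds
```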

vllm/v1/worker/tpu_model_runner.py

Lines changed: 4 additions & 5 deletions

```diff
@@ -937,11 +937,10 @@ def _get_model_inputs(self, input_ids: torch.Tensor,
             # NOTE(woosuk): To unify token ids and soft tokens (vision
             # embeddings), we always use embeddings (rather than token ids)
             # as input to the multimodal model, even when the input is text.
-            if mm_embeds:
-                inputs_embeds = self.model.get_input_embeddings(
-                    input_ids, mm_embeds)
-            else:
-                inputs_embeds = self.model.get_input_embeddings(input_ids)
+            inputs_embeds = self.model.get_input_embeddings(
+                input_ids=input_ids,
+                multimodal_embeddings=mm_embeds,
+            )
             return None, inputs_embeds
         else:
             # For text-only models, we use token ids as input.
```
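
Note that the GPU runner normalizes an empty `mm_embeds` list to `None` (`mm_embeds or None`) while the TPU runner passes the list through unchanged, so a model is expected to treat an empty list and `None` identically. A short usage sketch, reusing the hypothetical `ToyMultiModalModel` above, shows both call styles producing the same result:

```python
# Usage sketch of the two call sites after this change (illustrative only).
import torch

model = ToyMultiModalModel()
input_ids = torch.tensor([1, 2, 3, 4])
mm_embeds: list[torch.Tensor] = []  # e.g. a text-only batch

with torch.no_grad():
    # GPU model runner style: an empty list is normalized to None.
    gpu_embeds = model.get_input_embeddings(
        input_ids=input_ids,
        multimodal_embeddings=mm_embeds or None,
    )
    # TPU model runner style: the (possibly empty) list is passed as-is,
    # so the model must handle an empty list like None.
    tpu_embeds = model.get_input_embeddings(
        input_ids=input_ids,
        multimodal_embeddings=mm_embeds,
    )

assert torch.equal(gpu_embeds, tpu_embeds)
```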
