Skip to content

Commit 8160aa9

Browse files
russellbwwl2755-google
authored and committed
[Core] More fixes to MultiModalEmbeddings type handling (vllm-project#19715)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
1 parent 4c81525 commit 8160aa9

35 files changed

+71
-36
lines changed

vllm/model_executor/models/aria.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -620,7 +620,8 @@ def get_input_embeddings(
620620
multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
621621
) -> torch.Tensor:
622622
inputs_embeds = self.language_model.get_input_embeddings(input_ids)
623-
if multimodal_embeddings is not None:
623+
if multimodal_embeddings is not None \
624+
and len(multimodal_embeddings) != 0:
624625
inputs_embeds = merge_multimodal_embeddings(
625626
input_ids, inputs_embeds, multimodal_embeddings,
626627
self.config.image_token_index)

vllm/model_executor/models/aya_vision.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -430,7 +430,8 @@ def get_input_embeddings(
430430
multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
431431
) -> torch.Tensor:
432432
inputs_embeds = self.language_model.get_input_embeddings(input_ids)
433-
if multimodal_embeddings is not None:
433+
if multimodal_embeddings is not None \
434+
and len(multimodal_embeddings) != 0:
434435
inputs_embeds = merge_multimodal_embeddings(
435436
input_ids=input_ids,
436437
inputs_embeds=inputs_embeds,

vllm/model_executor/models/blip2.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -641,7 +641,8 @@ def get_input_embeddings(
641641
multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
642642
) -> torch.Tensor:
643643
inputs_embeds = self.language_model.get_input_embeddings(input_ids)
644-
if multimodal_embeddings is not None:
644+
if multimodal_embeddings is not None \
645+
and len(multimodal_embeddings) != 0:
645646
inputs_embeds = merge_multimodal_embeddings(
646647
input_ids, inputs_embeds, multimodal_embeddings,
647648
_IMAGE_TOKEN_ID)

vllm/model_executor/models/chameleon.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1005,7 +1005,8 @@ def get_input_embeddings(
10051005
) -> torch.Tensor:
10061006

10071007
inputs_embeds = self.model.get_input_embeddings(input_ids)
1008-
if multimodal_embeddings is not None:
1008+
if multimodal_embeddings is not None \
1009+
and len(multimodal_embeddings) != 0:
10091010
inputs_embeds = merge_multimodal_embeddings(
10101011
input_ids, inputs_embeds, multimodal_embeddings,
10111012
self.model.vocabulary_mapping.image_token_id)

vllm/model_executor/models/deepseek_vl2.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -600,7 +600,8 @@ def get_input_embeddings(
600600
multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
601601
) -> torch.Tensor:
602602
inputs_embeds = self.language_model.get_input_embeddings(input_ids)
603-
if multimodal_embeddings is not None:
603+
if multimodal_embeddings is not None \
604+
and len(multimodal_embeddings) != 0:
604605
inputs_embeds = merge_multimodal_embeddings(
605606
input_ids, inputs_embeds, multimodal_embeddings,
606607
self.image_token_id)

vllm/model_executor/models/florence2.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1046,7 +1046,8 @@ def get_input_embeddings(
10461046
multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
10471047
) -> torch.Tensor:
10481048
inputs_embeds = self.language_model.get_input_embeddings(input_ids)
1049-
if multimodal_embeddings is not None:
1049+
if multimodal_embeddings is not None \
1050+
and len(multimodal_embeddings) != 0:
10501051
inputs_embeds = merge_multimodal_embeddings(
10511052
input_ids, inputs_embeds, multimodal_embeddings,
10521053
self.pad_token_id)

vllm/model_executor/models/fuyu.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -345,7 +345,8 @@ def get_input_embeddings(
345345
multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
346346
) -> torch.Tensor:
347347
inputs_embeds = self.language_model.get_input_embeddings(input_ids)
348-
if multimodal_embeddings is not None:
348+
if multimodal_embeddings is not None \
349+
and len(multimodal_embeddings) != 0:
349350
inputs_embeds = merge_multimodal_embeddings(
350351
input_ids,
351352
inputs_embeds,

vllm/model_executor/models/gemma3_mm.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -592,7 +592,8 @@ def get_input_embeddings(
592592
multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
593593
) -> torch.Tensor:
594594
inputs_embeds = self.language_model.get_input_embeddings(input_ids)
595-
if multimodal_embeddings is not None:
595+
if multimodal_embeddings is not None \
596+
and len(multimodal_embeddings) != 0:
596597
inputs_embeds = merge_multimodal_embeddings(
597598
input_ids,
598599
inputs_embeds,

vllm/model_executor/models/glm4v.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -609,7 +609,8 @@ def get_input_embeddings(
609609
) -> torch.Tensor:
610610
inputs_embeds = self.transformer.get_input_embeddings(input_ids)
611611

612-
if multimodal_embeddings is not None:
612+
if multimodal_embeddings is not None \
613+
and len(multimodal_embeddings) != 0:
613614
inputs_embeds = merge_multimodal_embeddings(
614615
input_ids=input_ids,
615616
inputs_embeds=inputs_embeds,

vllm/model_executor/models/granite_speech.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -721,7 +721,8 @@ def get_input_embeddings(
721721
multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
722722
) -> torch.Tensor:
723723
"""Compute the merged LLM / audio embeddings."""
724-
if multimodal_embeddings is None:
724+
if multimodal_embeddings is None \
725+
or len(multimodal_embeddings) == 0:
725726
return self.language_model.get_input_embeddings(input_ids)
726727

727728
inputs_embeds = embed_multimodal(

0 commit comments

Comments
 (0)