Commit cfa1998

committed: cant compile yet + clean up commented code
Signed-off-by: raushan <raushan@huggingface.co>
1 parent e730323 · commit cfa1998

2 files changed: 6 additions & 9 deletions

vllm/model_executor/models/registry.py

Lines changed: 0 additions & 1 deletion
@@ -185,7 +185,6 @@
     "GraniteSpeechForConditionalGeneration": ("granite_speech", "GraniteSpeechForConditionalGeneration"), # noqa: E501
     "H2OVLChatModel": ("h2ovl", "H2OVLChatModel"),
     "InternVLChatModel": ("internvl", "InternVLChatModel"),
-    # "InternVLForConditionalGeneration": ("internvl", "InternVLForConditionalGeneration"), # noqa: E501
     "Idefics3ForConditionalGeneration":("idefics3","Idefics3ForConditionalGeneration"),
     "SmolVLMForConditionalGeneration": ("smolvlm","SmolVLMForConditionalGeneration"), # noqa: E501
     "KimiVLForConditionalGeneration": ("kimi_vl", "KimiVLForConditionalGeneration"), # noqa: E501

vllm/model_executor/models/transformers.py

Lines changed: 6 additions & 8 deletions
@@ -174,10 +174,10 @@ def get_hf_config(self):
         return self.ctx.model_config.hf_config

     def get_supported_mm_limits(self):
-        return {"image": None, "video": None}
+        return {"image": None}

     def get_mm_max_tokens_per_item(self, seq_len, mm_counts):
-        return {"image": self.get_max_image_tokens(), "video": 0}
+        return {"image": self.get_max_image_tokens()}

     def get_max_image_tokens(self) -> int:
         width, height = self.get_max_image_size()
@@ -750,7 +750,6 @@ def load_weights(self, weights: Iterable[tuple[str,
     MultiModalProcessor,
     info=MultiModalProcessingInfo,
     dummy_inputs=MultiModalDummyInputsBuilder)
-@support_torch_compile
 class TransformersForMultimodalLM(nn.Module, SupportsQuant, SupportsLoRA,
                                   SupportsPP, SupportsMultiModal):
     embedding_padding_modules = ["lm_head"]
@@ -857,12 +856,11 @@ def get_multimodal_embeddings(self, **kwargs):
         if pixel_values is not None:
             if isinstance(pixel_values, torch.Tensor):
                 pixel_values = pixel_values.flatten(0, 1).to(self.dtype)
-                if isinstance(num_image_patches, list):
-                    num_image_patches = torch.cat(num_image_patches)
-                num_image_patches = num_image_patches.flatten()
             else:
                 pixel_values = torch.cat(pixel_values).to(self.dtype)
-                num_image_patches = torch.cat(num_image_patches).flatten()
+
+            if isinstance(num_image_patches, list):
+                num_image_patches = torch.cat(num_image_patches)

             vision_embeddings = self.model.model.get_image_features(
                 pixel_values,
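Aside (illustration, not part of the diff): after this hunk, num_image_patches is concatenated only when it arrives as a list of tensors; a plain tensor passes through unchanged and is flattened later at the split site. A minimal sketch of that normalization, with made-up shapes:

import torch

# num_image_patches may already be a tensor, e.g. shape (batch, images_per_prompt) ...
num_image_patches = torch.tensor([[3, 5]])
# ... or a list of per-prompt tensors (hypothetical example):
# num_image_patches = [torch.tensor([3]), torch.tensor([5])]

if isinstance(num_image_patches, list):
    num_image_patches = torch.cat(num_image_patches)  # list -> tensor([3, 5])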
@@ -880,7 +878,7 @@ def get_multimodal_embeddings(self, **kwargs):
             # but transformers returns concat tensors if each patch
             # is of different size. We split it back to make vLLM happy
             vision_embeddings = torch.split(vision_embeddings,
-                                            num_image_patches.tolist())
+                                            num_image_patches.flatten().tolist())
             vision_embeddings = [
                 embed.flatten(start_dim=0, end_dim=-2)
                 for embed in vision_embeddings
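Aside (sketch with assumed shapes, not from the diff): torch.split with a list of per-image patch counts turns the concatenated embedding tensor returned by transformers back into one chunk per image, which is why the possibly 2-D count tensor is flattened to a Python list first:

import torch

hidden = 4
# two images with 3 and 5 patches each, concatenated along dim 0 (assumed layout)
vision_embeddings = torch.randn(3 + 5, hidden)
num_image_patches = torch.tensor([[3, 5]])  # may be 2-D, hence .flatten()

chunks = torch.split(vision_embeddings, num_image_patches.flatten().tolist())
# chunks[0].shape == (3, hidden); chunks[1].shape == (5, hidden)
vision_embeddings = [c.flatten(start_dim=0, end_dim=-2) for c in chunks]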
