@@ -174,10 +174,10 @@ def get_hf_config(self):
         return self.ctx.model_config.hf_config

     def get_supported_mm_limits(self):
-        return {"image": None, "video": None}
+        return {"image": None}

     def get_mm_max_tokens_per_item(self, seq_len, mm_counts):
-        return {"image": self.get_max_image_tokens(), "video": 0}
+        return {"image": self.get_max_image_tokens()}

     def get_max_image_tokens(self) -> int:
         width, height = self.get_max_image_size()
@@ -750,7 +750,6 @@ def load_weights(self, weights: Iterable[tuple[str,
     MultiModalProcessor,
     info=MultiModalProcessingInfo,
     dummy_inputs=MultiModalDummyInputsBuilder)
-@support_torch_compile
 class TransformersForMultimodalLM(nn.Module, SupportsQuant, SupportsLoRA,
                                   SupportsPP, SupportsMultiModal):
     embedding_padding_modules = ["lm_head"]
@@ -857,12 +856,11 @@ def get_multimodal_embeddings(self, **kwargs):
         if pixel_values is not None:
             if isinstance(pixel_values, torch.Tensor):
                 pixel_values = pixel_values.flatten(0, 1).to(self.dtype)
-                if isinstance(num_image_patches, list):
-                    num_image_patches = torch.cat(num_image_patches)
-                num_image_patches = num_image_patches.flatten()
             else:
                 pixel_values = torch.cat(pixel_values).to(self.dtype)
-                num_image_patches = torch.cat(num_image_patches).flatten()
+
+            if isinstance(num_image_patches, list):
+                num_image_patches = torch.cat(num_image_patches)

             vision_embeddings = self.model.model.get_image_features(
                 pixel_values,
@@ -880,7 +878,7 @@ def get_multimodal_embeddings(self, **kwargs):
             # but transformers returns concat tensors if each patch
             # is of different size. We split it back to make vLLM happy
             vision_embeddings = torch.split(vision_embeddings,
-                                            num_image_patches.tolist())
+                                            num_image_patches.flatten().tolist())
             vision_embeddings = [
                 embed.flatten(start_dim=0, end_dim=-2)
                 for embed in vision_embeddings
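
For context on the last two hunks: `num_image_patches` is now concatenated once (whether it arrives as a list of tensors or a single tensor), and the `.flatten()` is deferred to the `torch.split` call that actually consumes the per-image patch counts. Below is a minimal standalone sketch of that behavior; the patch counts and the embedding shape are made up for illustration and are not taken from the PR.

```python
# Illustrative sketch only: made-up patch counts and hidden size.
import torch

# Pretend the processor returned per-request patch counts as a list of
# 1-D tensors: request 1 has images with 4 and 6 patches, request 2 has 5.
num_image_patches = [torch.tensor([4, 6]), torch.tensor([5])]

# Concatenate once, regardless of which pixel_values branch was taken.
if isinstance(num_image_patches, list):
    num_image_patches = torch.cat(num_image_patches)

# Concatenated vision features for all 4 + 6 + 5 = 15 patches (hidden size 8).
vision_embeddings = torch.randn(15, 8)

# Flatten only where the per-image counts are consumed, mirroring the diff.
splits = torch.split(vision_embeddings, num_image_patches.flatten().tolist())
print([t.shape for t in splits])
# [torch.Size([4, 8]), torch.Size([6, 8]), torch.Size([5, 8])]
```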