41 | 41 | from vllm.model_executor.model_loader.weight_utils import default_weight_loader
42 | 42 | from vllm.model_executor.sampling_metadata import SamplingMetadata
43 | 43 | from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs
44 |    | -from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalInputs,
45 |    | -    PlaceholderRange, MultiModalDataDict)
   | 44 | +from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
   | 45 | +    MultiModalInputs, PlaceholderRange)
46 | 46 | from vllm.multimodal.parse import ImageProcessorItems
47 | 47 | from vllm.multimodal.processing import (BaseMultiModalProcessor,
48 | 48 |     BaseProcessingInfo)
49 |    | -from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
   | 49 | +from vllm.multimodal.profiling import BaseDummyInputsBuilder
50 | 50 | from vllm.sequence import IntermediateTensors
51 | 51 | from vllm.transformers_utils.processor import cached_get_processor
52 | 52 |
@@ -124,8 +124,9 @@ def replace_linear_class(
124 | 124 | @contextmanager
125 | 125 | def init_on_device_without_buffers(device: torch.device):
126 | 126 | """
127 |     | - A context manager under which models are initialized with all parameters on the specified device.
128 |     | - However buffers are not initialized on specified device.
    | 127 | + A context manager under which models are initialized with all
    | 128 | + parameters on the specified device. However buffers are not
    | 129 | + initialized on specified device.
129 | 130 |
130 | 131 | Args:
131 | 132 | device (`torch.device`):
@@ -162,8 +163,7 @@ def wrapper(*args, **kwargs):
162 | 163 | yield
163 | 164 | finally:
164 | 165 | nn.Module.register_parameter = old_register_parameter
165 |     | - for torch_function_name, old_torch_function in tensor_constructors_to_patch.items(
166 |     | - ):
    | 166 | + for torch_function_name, old_torch_function in tensor_constructors_to_patch.items():
167 | 167 | setattr(torch, torch_function_name, old_torch_function)
168 | 168 |
169 | 169 |
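For context on the hunks above: `init_on_device_without_buffers` is a context manager that temporarily patches `nn.Module.register_parameter` and a set of torch tensor constructors (restored in the `finally` block shown here) so that parameters are created on the requested device while buffers are left untouched. A minimal sketch of the same idea, covering only the parameter half and not the actual vLLM implementation:

```python
# Simplified sketch (not the exact vLLM code): create parameters on a
# target device, and always undo the monkey-patch on exit.
from contextlib import contextmanager

import torch
import torch.nn as nn


@contextmanager
def init_params_on_device(device: torch.device):
    old_register_parameter = nn.Module.register_parameter

    def register_parameter(module, name, param):
        old_register_parameter(module, name, param)
        if param is not None:
            # Re-create the parameter on `device`; buffers are never touched.
            module._parameters[name] = nn.Parameter(
                module._parameters[name].to(device),
                requires_grad=param.requires_grad)

    try:
        nn.Module.register_parameter = register_parameter
        yield
    finally:
        # Mirrors the `finally` block in the diff: restore the original hook.
        nn.Module.register_parameter = old_register_parameter


with init_params_on_device(torch.device("meta")):
    layer = nn.Linear(8, 8)
print(layer.weight.device)  # meta
```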
@@ -216,7 +216,7 @@ def get_dummy_mm_data(
216 | 216 |
217 | 217 | target_width, target_height = self.info.get_max_image_size()
218 | 218 |
219 |     | - return {
    | 219 | +        return {
220 | 220 | "image":
221 | 221 | self._get_dummy_images(width=target_width,
222 | 222 | height=target_height,
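The `get_dummy_mm_data` hunk above returns placeholder images used only for memory profiling. Very roughly, and setting aside vLLM's own `self._get_dummy_images` helper, the dummy data amounts to something like the sketch below (`make_dummy_images` and the sizes are illustrative, not part of the diff):

```python
# Illustration only: blank RGB images at the maximum supported resolution,
# one per image slot, standing in for real user images during profiling.
from PIL import Image


def make_dummy_images(width: int, height: int, count: int) -> list[Image.Image]:
    return [Image.new("RGB", (width, height), color=0) for _ in range(count)]


# e.g. {"image": [<PIL.Image 336x336>, <PIL.Image 336x336>]}
mm_data = {"image": make_dummy_images(width=336, height=336, count=2)}
```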
@@ -253,13 +253,11 @@ def _get_mm_fields_config(
253 | 253 | hf_processor_mm_kwargs,
254 | 254 | num_image_patches: torch.Tensor = None,
255 | 255 | ):
256 |     | - hf_inputs.pop(
257 |     | - "attention_mask",
258 |     | - None)  # processors always return a mask but vLLM doesn't need it
    | 256 | + # HF Processors always return a mask but vLLM doesn't need it
    | 257 | + hf_inputs.pop("attention_mask", None)
259 | 258 | mm_fields = {
260 |     | - key: MultiModalFieldConfig.flat_from_sizes("image",
261 |     | - num_image_patches)
262 |     | - for key in hf_inputs.keys()
    | 259 | + key: MultiModalFieldConfig.flat_from_sizes("image", num_image_patches)
    | 260 | + for key in hf_inputs
263 | 261 | }
264 | 262 | mm_fields["image_embeds"] = MultiModalFieldConfig.flat_from_sizes(
265 | 263 | "image", num_image_patches)
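In this hunk every key coming out of the HF processor gets the same "flat per-image" field spec, keyed by `num_image_patches`. Pulled out of the fragmented diff into one self-contained snippet (the key name and patch counts are made up; `MultiModalFieldConfig.flat_from_sizes` is the call the diff itself uses):

```python
import torch

from vllm.multimodal.inputs import MultiModalFieldConfig

# Pretend HF processor outputs for two images with 4 and 9 patches,
# concatenated along dim 0 into a single flat tensor.
num_image_patches = torch.tensor([4, 9])
hf_inputs = {"pixel_values": torch.randn(13, 3, 14, 14)}

mm_fields = {
    key: MultiModalFieldConfig.flat_from_sizes("image", num_image_patches)
    for key in hf_inputs
}
# Precomputed image embeddings follow the same per-image layout.
mm_fields["image_embeds"] = MultiModalFieldConfig.flat_from_sizes(
    "image", num_image_patches)
```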
@@ -311,13 +309,17 @@ def apply(
311 | 309 | """
312 | 310 | if return_mm_hashes:
313 | 311 | raise ValueError(
314 |     | - "TransformersMultimodalLM doesn't support mm hashing yet! Probably you did not set "
315 |     | - "`disable_mm_preprocessor_cache=True`.")
    | 312 | + "TransformersMultimodalLM doesn't support mm hashing yet! "
    | 313 | + "Probably you did not set `disable_mm_preprocessor_cache=True`")
316 | 314 |
317 | 315 | mm_items = self._to_mm_items(mm_data)
318 | 316 | hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
319 | 317 |
320 |     | - prompt_ids, processed_data, mm_token_type_ids = self._apply_hf_processor_text_mm(
    | 318 | + (
    | 319 | + prompt_ids,
    | 320 | + processed_data,
    | 321 | + mm_token_type_ids
    | 322 | + ) = self._apply_hf_processor_text_mm(
321 | 323 | prompt_text=prompt,
322 | 324 | mm_items=mm_items,
323 | 325 | hf_processor_mm_kwargs=hf_processor_mm_kwargs,
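The reworded error message above points users at the `disable_mm_preprocessor_cache` engine option. If that hint applies to your setup, turning the cache off when constructing the engine might look like the following sketch (the model name is a placeholder, and `model_impl="transformers"` assumes the Transformers fallback backend this file implements):

```python
from vllm import LLM

# Sketch: disable the multimodal preprocessor cache, as the error message
# suggests, when running a multimodal model through the Transformers backend.
llm = LLM(
    model="llava-hf/llava-1.5-7b-hf",   # placeholder model
    model_impl="transformers",
    disable_mm_preprocessor_cache=True,
)
```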
@@ -435,7 +437,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
435 | 437 | config_override = ConfigOverride(
436 | 438 | config, sliding_window=config.interleaved_sliding_window)
437 | 439 |
438 |     | - # Set correct attn impl and init on "meta" to delay allocating GPU tensors
    | 440 | + # Set correct attn and init on "meta" to delay allocating GPU tensors
439 | 441 | self.text_config._attn_implementation = "vllm"
440 | 442 | with init_on_device_without_buffers("meta"):
441 | 443 | # FIXME(Isotr0py): We need to refactor this part in the future to
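The comment edited above refers to building the model skeleton on the `meta` device so no real memory is allocated until weights are loaded. As a generic PyTorch illustration of that idea (not vLLM's actual loading path):

```python
import torch
import torch.nn as nn

# Build the module structure on "meta": shapes exist, storage does not.
with torch.device("meta"):
    block = nn.Linear(4096, 4096)

# Materialize storage only when it is actually needed ("cuda" in practice;
# "cpu" here so the sketch runs anywhere).
block = block.to_empty(device="cpu")
print(block.weight.shape, block.weight.device)
```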
@@ -870,9 +872,9 @@ def get_multimodal_embeddings(self, **kwargs):
870 | 872 | if vision_embeddings.ndim == 2:
871 | 873 | vision_embeddings = vision_embeddings.unsqueeze(0)
872 | 874 |
873 |     | - # Embeddings have to be 2D tensors of length `num_images` but transformers
874 |     | - # returns concat tensors if each patch is of different size. We split it back
875 |     | - # to make vLLM assertions happy
    | 875 | + # Embeddings have to be 2D tensors of length `num_images`
    | 876 | + # but transformers returns concat tensors if each patch
    | 877 | + # is of different size. We split it back to make vLLM happy
876 | 878 | vision_embeddings = torch.split(vision_embeddings,
877 | 879 | num_image_patches.tolist())
878 | 880 | vision_embeddings = [
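For the comment rewritten in this last hunk, the shapes work out roughly as follows (toy numbers, not vLLM code): transformers hands back a single tensor with the patch embeddings of all images concatenated, and the split restores a list with one 2D tensor per image, which is the layout vLLM expects.

```python
import torch

hidden_size = 64
num_image_patches = torch.tensor([4, 9])

# Concatenated output: (total_patches, hidden_size)
vision_embeddings = torch.randn(int(num_image_patches.sum()), hidden_size)

# Back to a list of length `num_images`, one 2D tensor per image.
per_image = torch.split(vision_embeddings, num_image_patches.tolist())
print(len(per_image), per_image[0].shape, per_image[1].shape)
# 2 torch.Size([4, 64]) torch.Size([9, 64])
```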