From 26a9f1b5c177eea869a82b7260ddaf8daa1b3fef Mon Sep 17 00:00:00 2001 From: raushan Date: Wed, 19 Feb 2025 16:43:11 +0100 Subject: [PATCH 01/23] tmp --- vllm/engine/llm_engine.py | 2 + vllm/entrypoints/llm.py | 5 ++ vllm/executor/uniproc_executor.py | 1 + vllm/inputs/preprocess.py | 1 + vllm/inputs/registry.py | 2 + vllm/model_executor/models/llama.py | 1 + vllm/model_executor/models/llava.py | 1 + vllm/model_executor/models/transformers.py | 60 +++++++++++++++------- vllm/multimodal/base.py | 1 + vllm/multimodal/processing.py | 24 +++++---- vllm/multimodal/registry.py | 2 + vllm/v1/engine/core.py | 6 ++- vllm/v1/engine/llm_engine.py | 1 + vllm/v1/engine/mm_input_cache.py | 3 ++ vllm/v1/engine/processor.py | 7 ++- vllm/v1/worker/gpu_model_runner.py | 4 ++ vllm/worker/model_runner.py | 18 ++++++- vllm/worker/worker_base.py | 2 + 18 files changed, 109 insertions(+), 32 deletions(-) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 2e5bc75c6db..816baab8c0b 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -724,6 +724,7 @@ def add_request( if inputs is not None: prompt = inputs assert prompt is not None and params is not None + print("CALL add_request", prompt) if lora_request is not None and not self.lora_config: raise ValueError(f"Got lora_request {lora_request} but LoRA is " @@ -754,6 +755,7 @@ def add_request( lora_request=lora_request, prompt_adapter_request=prompt_adapter_request, ) + print("mm_hashes", preprocessed_inputs.get('mm_hashes')) processed_inputs = self.input_processor(preprocessed_inputs) self._add_processed_request( diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 40b7a529ebf..e71ac91e5fc 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -691,6 +691,11 @@ def chat( ] tokenizer = self.get_tokenizer() + model_config = self.llm_engine.processor.input_preprocessor.model_config + mm_processor = self.llm_engine.processor.input_preprocessor.mm_registry.create_processor(model_config, tokenizer) + processor = mm_processor.info.ctx.get_hf_processor() + chat_template = processor.chat_template + model_config = self.llm_engine.get_model_config() resolved_content_format = resolve_chat_template_content_format( chat_template, diff --git a/vllm/executor/uniproc_executor.py b/vllm/executor/uniproc_executor.py index 94db232240d..bf693cab5cc 100644 --- a/vllm/executor/uniproc_executor.py +++ b/vllm/executor/uniproc_executor.py @@ -53,6 +53,7 @@ def collective_rpc(self, kwargs: Optional[Dict] = None) -> List[Any]: if kwargs is None: kwargs = {} + # print("self.driver_worker", self.driver_worker, method, getattr(self.driver_worker, method)) answer = run_method(self.driver_worker, method, args, kwargs) return [answer] diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py index bc5856990da..b2e4866c604 100644 --- a/vllm/inputs/preprocess.py +++ b/vllm/inputs/preprocess.py @@ -317,6 +317,7 @@ def _prompt_to_llm_inputs( * :class:`SingletonInputs` instance """ parsed = parse_singleton_prompt(prompt) + print("CALLED PROCESSOR", parsed["type"]) if parsed["type"] == "str": prompt_text = parsed["content"] diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py index 87b7a7631e4..d8e16e94569 100644 --- a/vllm/inputs/registry.py +++ b/vllm/inputs/registry.py @@ -176,6 +176,7 @@ def call_hf_processor( allow_var_kwargs=True, ) + # print("CTX", data.keys(), merged_kwargs.keys()) try: return hf_processor(**data, **merged_kwargs, return_tensors="pt") except Exception as exc: @@ -461,6 +462,7 @@ def 
process_input(self, model_config: "ModelConfig", processor, ) + print("process_input", processor, mm_processor_kwargs.keys()) processed_inputs = processor( InputContext(model_config), inputs, diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 2ff52dd7891..a5fd00b670d 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -375,6 +375,7 @@ def forward( "residual": residual }) + # print(hidden_states.shape) hidden_states, _ = self.norm(hidden_states, residual) return hidden_states diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index 6a4277adb6b..a60e0b83a8f 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -530,6 +530,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: hf_config=config.text_config, prefix=maybe_prefix(prefix, "language_model"), ) + print("self.language_model", self.language_model.__class__, self.vision_tower.__class__) self.make_empty_intermediate_tensors = ( self.language_model.make_empty_intermediate_tensors) diff --git a/vllm/model_executor/models/transformers.py b/vllm/model_executor/models/transformers.py index 9b456b24895..f7dfdfa619c 100644 --- a/vllm/model_executor/models/transformers.py +++ b/vllm/model_executor/models/transformers.py @@ -38,7 +38,10 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors -from .interfaces import SupportsQuant +from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry +from vllm.inputs import InputRegistry, INPUT_REGISTRY, DummyData +from vll.Sequence import SequenceData +from .interfaces import SupportsQuant, SupportsMultiModal from .utils import maybe_prefix logger = init_logger(__name__) @@ -119,7 +122,20 @@ def forward(self, input: torch.Tensor) -> torch.Tensor: ) -class TransformersModel(nn.Module, SupportsQuant): +def map_auto_class(config): + AutoModel + + +def dummy_encoder_data_for_whisper(ctx, seq_len: int, mm_counts): + assert mm_counts["image"] == 1 + return DummyData( + SequenceData.from_prompt_token_counts((0, 596)), + {"image": np.zeros((3, 336, 336))}, + ) + +@INPUT_REGISTRY.register_dummy_encoder_data(dummy_encoder_data_for_whisper) +@MULTIMODAL_REGISTRY.register_max_multimodal_tokens("image", 576) +class TransformersModel(nn.Module, SupportsQuant, SupportsMultiModal): embedding_padding_modules = ["lm_head"] embedding_modules = ["embed_tokens" ] # TODO transformers will have a util to get it @@ -132,12 +148,12 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: cache_config = vllm_config.cache_config self.config = config - self.vocab_size = config.vocab_size - self.unpadded_vocab_size = config.vocab_size + self.vocab_size = config.get_text_config().vocab_size + self.unpadded_vocab_size = config.get_text_config().vocab_size self.model: PreTrainedModel = AutoModel.from_config( self.config, - attn_implementation="vllm", + attn_implementation={"text_config": "vllm", "vision_config": "eager"}, torch_dtype=vllm_config.model_config.dtype, trust_remote_code=vllm_config.model_config.trust_remote_code, ) @@ -150,39 +166,42 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: tp_size = get_tensor_model_parallel_world_size() self.attention_instances = [ Attention( - num_heads=divide(config.num_attention_heads, tp_size), - head_size=config.head_dim, + num_heads=divide(config.get_text_config().num_attention_heads, tp_size), + 
head_size=config.get_text_config().head_dim, # NOTE: We use Llama scale as default, if it's set by # Transformers, it's updated in vllm_flash_attention_forward - scale=config.head_dim**-0.5, - num_kv_heads=divide(config.num_key_value_heads, tp_size), + scale=config.get_text_config().head_dim**-0.5, + num_kv_heads=divide(config.get_text_config().num_key_value_heads, tp_size), cache_config=cache_config, quant_config=self.quant_config, - prefix=f"{i}.attn") for i in range(config.num_hidden_layers) + prefix=f"{i}.attn") for i in range(config.get_text_config().num_hidden_layers) ] # Model modifications self.replace_vocab_embed_class(self.model) # ForCausalLM modifications - self.lm_head = ParallelLMHead(config.vocab_size, - config.hidden_size, + self.lm_head = ParallelLMHead(config.get_text_config().vocab_size, + config.get_text_config().hidden_size, quant_config=self.quant_config, prefix=maybe_prefix(prefix, "lm_head")) - if config.tie_word_embeddings: + if config.get_text_config().tie_word_embeddings: self.lm_head.weight = self.model.get_input_embeddings().weight logit_scale = getattr(config, "logit_scale", 1.0) self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, - config.vocab_size, logit_scale) + config.get_text_config().vocab_size, logit_scale) self.sampler = get_sampler() + MultiModalRegistry()._get_plugin("image").register_max_multimodal_tokens(576) + InputRegistry()._dummy_factories_by_model_type[model_cls] = factory + def apply_base_model_tp_plan(self, module: nn.Module, prefix: str = ""): """ Apply the base model tensor parallelization plan to a module. Currently only supports linear layers. """ - if (self.config.base_model_tp_plan is None + if (self.config.get_text_config().base_model_tp_plan is None and get_tensor_model_parallel_world_size() > 1): raise ValueError( "Trying to run tensor parallelization but the model does not " @@ -190,7 +209,7 @@ def apply_base_model_tp_plan(self, module: nn.Module, prefix: str = ""): for child_name, child_module in module.named_children(): qual_name = maybe_prefix(prefix, child_name) - for pattern, style in self.config.base_model_tp_plan.items(): + for pattern, style in self.config.get_text_config().base_model_tp_plan.items(): if re.match(pattern, qual_name) and isinstance( child_module, nn.Linear): new_module = replace_linear_class(child_module, style, @@ -204,8 +223,8 @@ def replace_vocab_embed_class(self, module: nn.Module): # Use native set input embeddings new_module = VocabParallelEmbedding( self.vocab_size, - self.config.hidden_size, - org_num_embeddings=self.config.vocab_size, + self.config.get_text_config().hidden_size, + org_num_embeddings=self.vocab_size, quant_config=None, ) log_replacement("input embedding", self.model.get_input_embeddings(), @@ -252,7 +271,10 @@ def load_weights(self, weights: Iterable[tuple[str, loaded_params = set[str]() for name, loaded_weight in weights: if name not in params_dict: - name = f"{self.model.base_model_prefix}.{name}" + if "lm_head" in name: + name = name.replace("language_model.", "") + else: + name = f"{self.model.base_model_prefix}.{name}" if name in params_dict: param = params_dict[name] weight_loader = getattr(param, "weight_loader", diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py index c48d07ba365..28abc8b5fe6 100644 --- a/vllm/multimodal/base.py +++ b/vllm/multimodal/base.py @@ -219,6 +219,7 @@ def get_max_multimodal_tokens(self, model_config: "ModelConfig") -> int: if not supports_multimodal(model_cls): return 0 + print("_max_mm_tokens", self._max_mm_tokens, 
model_cls, self.__class__) max_mm_tokens = self._max_mm_tokens.get(model_cls) if max_mm_tokens is None: return 0 diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py index fcd02fbd520..268e2dc0b19 100644 --- a/vllm/multimodal/processing.py +++ b/vllm/multimodal/processing.py @@ -867,6 +867,7 @@ def _apply_hf_processor_text_mm( mm_data=processor_data, mm_kwargs=hf_processor_mm_kwargs, ) + print("prompt_text", prompt_text, processed_data.keys()) processed_data.update(passthrough_data) prompt_ids, = processed_data.pop("input_ids").tolist() @@ -995,6 +996,7 @@ def _cached_apply_hf_processor( _, passthrough_data = self._get_hf_mm_data(mm_data_items) if cache is None or passthrough_data: + print("NO CACHE!") return self._apply_hf_processor_main( prompt=prompt, mm_items=mm_data_items, @@ -1039,6 +1041,7 @@ def _cached_apply_hf_processor( modality: 0 for modality in mm_missing_data_items } + print("CACHED!", mm_missing_idxs) merged_kw_items = list[MultiModalKwargsItem]() for modality, kw_items in mm_maybe_cached_kw_items.items(): @@ -1232,14 +1235,16 @@ def apply( else: mm_hashes = None - ( - prompt_ids, - mm_kwargs, - is_repl_applied, - ) = self._cached_apply_hf_processor( - prompt, - mm_items, - hf_processor_mm_kwargs, + # prompt_ids, mm_kwargs, is_repl_applied = self._cached_apply_hf_processor( + # prompt, + # mm_items, + # hf_processor_mm_kwargs, + # ) + prompt_ids, mm_kwargs, is_repl_applied = self._apply_hf_processor_main( + prompt=prompt, + mm_items=mm_items, + hf_processor_mm_kwargs=hf_processor_mm_kwargs, + enable_hf_prompt_replacement=True, ) unbound_prompt_repls = self._get_prompt_replacements( @@ -1279,12 +1284,13 @@ def apply( for modality, placeholders in mm_placeholders.items() } + print("DONE HERE?") return MultiModalInputs( type="multimodal", prompt=prompt, prompt_token_ids=prompt_ids, mm_kwargs=mm_kwargs, - mm_hashes=mm_hashes, + mm_hashes=None, #mm_hashes, mm_placeholders=mm_placeholder_ranges, ) diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py index 613d1db4167..cc91e9d0279 100644 --- a/vllm/multimodal/registry.py +++ b/vllm/multimodal/registry.py @@ -266,6 +266,7 @@ def get_max_tokens_per_item_by_modality( return processor.info.get_mm_max_tokens_per_item( seq_len, mm_limits) + print(self._plugins['image'].get_max_multimodal_tokens(model_config), self._plugins['image'].get_max_multimodal_tokens) return { key: plugin.get_max_multimodal_tokens(model_config) for key, plugin in self._plugins.items() @@ -285,6 +286,7 @@ def get_max_tokens_per_item_by_nonzero_modality( usage of a model. 
""" mm_limits = self.get_mm_limits_per_prompt(model_config) + print("mm_limits", mm_limits) return { key: max_tokens_per_mm_item diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 66e252b7ccb..5ccbbd32bd4 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -129,9 +129,11 @@ def add_request(self, request: EngineCoreRequest): assert request.mm_inputs is not None request.mm_inputs = self.mm_input_cache_server.get_and_update( request.mm_inputs, request.mm_hashes) + # print("request.mm_hashes is None", request.mm_inputs[0] is None) # V1 ADDED HERE IF CACHED from MMInputMapperServer req = Request.from_engine_core_request(request) + # print("self.scheduler.add_request", self.scheduler.add_request) # vllm.v1.core.scheduler.Scheduler.add_request self.scheduler.add_request(req) def abort_requests(self, request_ids: List[str]): @@ -150,7 +152,9 @@ def step(self) -> EngineCoreOutputs: return EngineCoreOutputs( outputs=[], scheduler_stats=self.scheduler.make_stats()) - scheduler_output = self.scheduler.schedule() + scheduler_output = self.scheduler.schedule() # kinda allocated new kv cache and updates many internal stats for the requests + # print("scheduler_output", scheduler_output) # DEFI HAS pixel values when V1 is set + # print("self.model_executor", self.model_executor.execute_model) output = self.model_executor.execute_model(scheduler_output) engine_core_outputs = self.scheduler.update_from_output( scheduler_output, output) # type: ignore diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index c9a4c5369df..c5d03ace732 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -131,6 +131,7 @@ def add_request( ) -> None: # 1) Process raw inputs into the request. + print("CALL add_request", prompt, self.processor) request = self.processor.process_inputs(request_id, prompt, params, arrival_time, lora_request, trace_headers, diff --git a/vllm/v1/engine/mm_input_cache.py b/vllm/v1/engine/mm_input_cache.py index a1d802bf818..0aaf264ad31 100644 --- a/vllm/v1/engine/mm_input_cache.py +++ b/vllm/v1/engine/mm_input_cache.py @@ -100,6 +100,7 @@ def process_inputs( mm_input = self.mm_cache.get(mm_hash) self.mm_cache_total += 1 + # print("mm_input is None", mm_input is None) if mm_input is None: if precomputed_mm_inputs is not None: # Reuse precomputed input (for merged preprocessor) @@ -118,7 +119,9 @@ def process_inputs( else: self.mm_cache_hits += 1 mm_input = None # Avoids sending mm_input to Server + print(" Avoids sending mm_input to Server, use cache somewhow, I dunno how yet") + # print("mm_input is None", mm_input is None) ret_inputs.append(mm_input) return ret_inputs diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index b7eee5a3997..1232579b1b4 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -111,6 +111,9 @@ def process_inputs( # 2. For multimodal models with a merged preprocessor, preprocess # multimodal data and expand prompt token ids accordingly. # 3. Apply prompt adapter to prompt token ids if one exists. + # Process inputs. + + # CALL input_preprocessor (preprocess.py) where print(add request) preprocessed_inputs = self.input_preprocessor.preprocess( prompt, request_id=request_id, @@ -155,6 +158,7 @@ def process_inputs( # Fallback to using MultiModalHasher directly. 
else: mm_hashes = MultiModalHasher.hash_prompt_mm_data(prompt) + print('HASHING', bool(decoder_inputs.multi_modal_hashes), mm_hashes) # For merged preprocessor, mm_data is already mm_inputs precomputed_mm_inputs: Optional[list[MultiModalKwargs]] = None @@ -187,6 +191,7 @@ def process_inputs( mm_positions, mm_hashes, ) + print("mm_positions", mm_positions) # NOTE: Sort multimodal inputs/kwargs ONLY IF there are multiple # modalities involved AND the model supports merged input processor. @@ -214,7 +219,7 @@ def process_inputs( mm_hashes=sorted_mm_hashes, mm_processor_kwargs=decoder_inputs.mm_processor_kwargs, precomputed_mm_inputs=precomputed_mm_inputs, - ) + ) # THIS ONE REMOVES INPUT IMAGES IF CACHED with MMInputMapperClient else: sorted_mm_inputs = None sorted_mm_hashes = None diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 31fe095a91b..b86099e780e 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -813,6 +813,7 @@ def _execute_encoder(self, scheduler_output: "SchedulerOutput"): # depending on the input multimodal items. curr_group_outputs = self.model.get_multimodal_embeddings( **batched_mm_inputs) + print("curr_group_outputs", curr_group_outputs.shape) for output in curr_group_outputs: encoder_outputs.append(output) @@ -881,6 +882,7 @@ def execute_model( # Prepare the decoder inputs. attn_metadata, logits_indices = self._prepare_inputs(scheduler_output) + num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens if (self.use_cuda_graph and num_scheduled_tokens <= self.cudagraph_batch_sizes[-1]): @@ -893,6 +895,7 @@ def execute_model( num_input_tokens = num_scheduled_tokens attn_metadata.num_input_tokens = num_input_tokens + print("self.is_multimodal_model", num_scheduled_tokens, num_input_tokens) if self.is_multimodal_model: # NOTE(woosuk): To unify token ids and soft tokens (vision # embeddings), we always use embeddings (rather than token ids) @@ -932,6 +935,7 @@ def execute_model( for k, v in self.intermediate_tensors.items() }) + print("inputs_embeds", inputs_embeds.shape) # THIS IN THE ENTRYPOINT IN V1 # Run the decoder. # Use persistent buffers for CUDA graphs. with set_forward_context(attn_metadata, self.vllm_config): diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 67d175c373d..ea05c444efb 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -542,6 +542,7 @@ def _compute_for_prefix_cache_hit( remaining blocks. """ computed_block_nums = inter_data.computed_block_nums + print("cache hit", computed_block_nums is not None, inter_data.is_prompt) # Note that prefix caching does not support sliding window. prefix_cache_hit = (computed_block_nums is not None @@ -651,10 +652,12 @@ def _compute_prompt_adapter_input( # Note that when is_prompt=True, we expect only one sequence # in the group. if not self.enable_prompt_adapter: + print("no enable_prompt_adapter") return prompt_adapter_id = seq_group_metadata.prompt_adapter_id if prompt_adapter_id <= 0 or not inter_data.is_prompt: + print("no prompt_adapter_id", prompt_adapter_id) return # We expect only one sequence in the group when is_prompt=True. 
@@ -670,6 +673,7 @@ def _compute_prompt_adapter_input( inter_data.prompt_adapter_prompt_mapping = [prompt_adapter_id] * ( query_len if seq_group_metadata.sampling_params and seq_group_metadata.sampling_params.prompt_logprobs else 1) + print("prompt adapters", inter_data.prompt_adapter_index_mapping, inter_data.prompt_adapter_prompt_mapping) def _compute_multi_modal_input(self, inter_data: InterDataForSeqGroup, seq_group_metadata: SequenceGroupMetadata): @@ -683,9 +687,11 @@ def _compute_multi_modal_input(self, inter_data: InterDataForSeqGroup, if not mm_data: return + # print("seq_group_metadata", seq_group_metadata, positions) if self.runner.mm_registry.has_processor(self.runner.model_config): mm_kwargs = mm_data else: + print("RUN INPUT MAPPER AGAIN BUT WHY") mm_kwargs = self.multi_modal_input_mapper( mm_data, seq_group_metadata.mm_processor_kwargs, @@ -693,6 +699,7 @@ def _compute_multi_modal_input(self, inter_data: InterDataForSeqGroup, inter_data.multi_modal_kwargs = mm_kwargs inter_data.multi_modal_placeholder_maps = placeholder_maps + print("placeholder_maps", placeholder_maps["image"].src_ranges, placeholder_maps["image"].dest_ranges) # special processing for mrope position deltas. if self.runner.model_config.uses_mrope: @@ -752,12 +759,14 @@ def add_seq_group(self, seq_group_metadata: SequenceGroupMetadata): encoder_seq_len=encoder_seq_len) self.inter_data_list.append(inter_data) + # print("input_tokens", inter_data.input_tokens) for seq_idx in range(n_seqs): for per_seq_fn in self.per_seq_compute_fns: - per_seq_fn(inter_data, seq_idx, seq_group_metadata) + per_seq_fn(inter_data, seq_idx, seq_group_metadata) # ADDS PLACEHOLDER HERE I GUESS? for per_seq_group_fn in self.per_seq_group_compute_fns: - per_seq_group_fn(inter_data, seq_group_metadata) + per_seq_group_fn(inter_data, seq_group_metadata) # ADDS MM KWARGS HERE + # print("inter_data should have mm here!!!", inter_data.multi_modal_kwargs is not None) def _use_captured_graph(self, batch_size: int, @@ -977,6 +986,7 @@ def build(self) -> ModelInputForGPU: if data.multi_modal_kwargs is not None ] multi_modal_kwargs = MultiModalKwargs.batch(multi_modal_kwargs_list) + # print("building", multi_modal_kwargs.keys()) return self.model_input_cls( input_tokens=input_tokens_tensor, @@ -1718,6 +1728,10 @@ def execute_model( model_forward_end = torch.cuda.Event(enable_timing=True) model_forward_start.record() + if "pixel_values" in multi_modal_kwargs: + print('FINALLY FORWARD', model_input.input_tokens.shape, multi_modal_kwargs["pixel_values"].shape) + else: + print('DECODE', model_input.input_tokens.shape) if not bypass_model_exec: with set_forward_context(model_input.attn_metadata, self.vllm_config, virtual_engine): diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py index 190429074d5..8984ee83636 100644 --- a/vllm/worker/worker_base.py +++ b/vllm/worker/worker_base.py @@ -343,6 +343,7 @@ def _get_driver_input_and_broadcast( execute_model_req.virtual_engine, execute_model_req.finished_requests_ids)) + # print("self.model_runner.prepare_model_input", self.model_runner.prepare_model_input) kwargs = extract_previous_hidden_states(execute_model_req) if self.do_metadata_broadcast: @@ -417,6 +418,7 @@ def execute_model( orig_model_execute_time = intermediate_tensors.tensors.get( "model_execute_time", torch.tensor(0)).item() + # print("self.model_runner.execute_model", self.model_runner.execute_model) output = self.model_runner.execute_model( model_input=model_input, kv_caches=self.kv_cache[worker_input.virtual_engine] From 
a502988faf74c4faddd39e4c351017e4463d4da0 Mon Sep 17 00:00:00 2001 From: raushan Date: Fri, 21 Feb 2025 17:12:08 +0100 Subject: [PATCH 02/23] dump --- vllm/inputs/registry.py | 1 + vllm/model_executor/models/transformers.py | 266 ++++++++++++++++++--- vllm/multimodal/inputs.py | 9 + vllm/multimodal/processing.py | 3 +- vllm/v1/worker/gpu_model_runner.py | 2 +- 5 files changed, 247 insertions(+), 34 deletions(-) diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py index d8e16e94569..2536924a51d 100644 --- a/vllm/inputs/registry.py +++ b/vllm/inputs/registry.py @@ -388,6 +388,7 @@ def dummy_data_for_profiling( f"Expected at least {num_expected} dummy '{k}' instances " f"for profiling, but found {num_items} instances instead.") + print(dummy_data.multi_modal_data["pixel_values"].shape) return dummy_data def _default_input_processor( diff --git a/vllm/model_executor/models/transformers.py b/vllm/model_executor/models/transformers.py index f7dfdfa619c..02442eea845 100644 --- a/vllm/model_executor/models/transformers.py +++ b/vllm/model_executor/models/transformers.py @@ -19,7 +19,7 @@ import torch from torch import nn -from transformers import AutoModel, PreTrainedModel +from transformers import AutoModel, PreTrainedModel, LlavaConfig from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS from vllm.attention import Attention, AttentionMetadata @@ -37,10 +37,12 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors +from vllm.transformers_utils.processor import cached_get_processor +from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry, MultiModalKwargs +from vllm.multimodal.processing import BaseMultiModalProcessor, BaseProcessingInfo +from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs +from vllm.multimodal.inputs import MultiModalFieldConfig, MultiModalInputs, PlaceholderRange -from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry -from vllm.inputs import InputRegistry, INPUT_REGISTRY, DummyData -from vll.Sequence import SequenceData from .interfaces import SupportsQuant, SupportsMultiModal from .utils import maybe_prefix @@ -122,23 +124,181 @@ def forward(self, input: torch.Tensor) -> torch.Tensor: ) -def map_auto_class(config): - AutoModel +class MultiModalProcessingInfo(BaseProcessingInfo): + def get_hf_config(self): + # NOTE: this means we don't check if return config type is same as requested + # VLLM on contrary always checks. In whcih cases we can have different config types tho? 
+ return self.ctx.model_config.hf_config + def get_supported_mm_limits(self): + return {"image": None, "video": None} -def dummy_encoder_data_for_whisper(ctx, seq_len: int, mm_counts): - assert mm_counts["image"] == 1 - return DummyData( - SequenceData.from_prompt_token_counts((0, 596)), - {"image": np.zeros((3, 336, 336))}, - ) + def get_mm_max_tokens_per_item(self, seq_len, mm_counts): + return {"image": self.get_max_image_tokens(), "video": 100} + + def get_max_image_tokens(self) -> int: + # Is already an attribute in some VLMs and now reason to make it a required attribute + # TODO: @raushan add it for all VLM configs + return self.get_hf_config().image_seq_length + + def get_hf_processor(self): + processor = cached_get_processor(self.ctx.model_config.model) + return processor + + +class MultiModalDummyInputsBuilder(BaseDummyInputsBuilder): + def get_dummy_processor_inputs( + self, + seq_len, + mm_counts, + ) -> ProcessorInputs: + num_images = mm_counts.get("image", 0) + num_videos = mm_counts.get("video", 0) + num_frames = 8 + + processor = self.info.get_hf_processor() + image_token = getattr(processor, "image_token", None) + video_token = getattr(processor, "video_token", None) + + # TODO: raushan, we can have processor attr for `processor.max_output_size` which will infer + # max features for model in HF side. IMO should be all done on processor side, not on model config + vision_config = self.info.get_hf_config().vision_config + target_width = target_height = vision_config.image_size + + # NOTE: we can pass videos/images/audio to any processor With the new API used in MLLMs, + # HF processor will take the modality needed for model and ignore all others + mm_data = { + "image": self._get_dummy_images( + width=target_width, + height=target_height, + num_images=num_images + ), + "video": self._get_dummy_videos( + width=target_width, + height=target_height, + num_frames=num_frames, + num_videos=num_videos, + ) + } + + prompt_text = video_token*num_videos if video_token is not None else image_token*num_images + return ProcessorInputs( + prompt_text=prompt_text, + mm_data=mm_data, + ) + + +class MultiModalProcessor(BaseMultiModalProcessor): + def _get_prompt_replacements( + self, + mm_items, + hf_processor_mm_kwargs, + out_mm_kwargs: MultiModalKwargs, + ): + return + + def _get_mm_fields_config( + self, + hf_inputs, + hf_processor_mm_kwargs, + ): + return dict( + pixel_values=MultiModalFieldConfig.batched("image"), + mm_token_type_ids=MultiModalFieldConfig.batched("image"), + pixel_values_videos=MultiModalFieldConfig.batched("video"), + image_embeds=MultiModalFieldConfig.batched("image"), + video_embeds=MultiModalFieldConfig.batched("video"), + ) + + def _apply_hf_processor_text_mm( + self, + prompt_text, + mm_items, + hf_processor_mm_kwargs, + ): + """ + Apply the HF processor on the prompt text and multi-modal data + together. + + In addition, return whether prompt replacements have been applied. 
+ """ + processor_data, passthrough_data = self._get_hf_mm_data(mm_items) + processor_data["return_mm_token_type_ids"] = True + + processed_data = self._call_hf_processor( + prompt=prompt_text, + mm_data=processor_data, + mm_kwargs=hf_processor_mm_kwargs, + ) + print("prompt_text", prompt_text, processed_data["pixel_values"][0].shape) + processed_data.update(passthrough_data) + + prompt_ids, = processed_data.pop("input_ids").tolist() + mm_token_type_ids = processed_data.pop("mm_token_type_ids") -@INPUT_REGISTRY.register_dummy_encoder_data(dummy_encoder_data_for_whisper) -@MULTIMODAL_REGISTRY.register_max_multimodal_tokens("image", 576) + mm_kwargs = MultiModalKwargs.from_hf_inputs( + processed_data, + self._get_mm_fields_config(processed_data, hf_processor_mm_kwargs), + ) + + return prompt_ids, mm_kwargs, mm_token_type_ids + + def apply( + self, + prompt, + mm_data, + hf_processor_mm_kwargs, + ) -> MultiModalInputs: + """ + Process multi-modal inputs to be used in vLLM. + + Apply HF Processor on prompt text and multi-modal data together, + outputting token IDs and processed tensors. + """ + mm_items = self._to_mm_items(mm_data) + prompt_ids, mm_kwargs, mm_token_type_ids = self._apply_hf_processor_text_mm( + prompt_text=prompt, + mm_items=mm_items, + hf_processor_mm_kwargs=hf_processor_mm_kwargs, + ) + + # HF processor will return `mm_token_type_ids` from which + # we can infer mm_placeholders. Until then hardcode to make code run + # Below tested on Llava. Prompts and `mm_token_type_ids` are always bs=1 + mm_positions = torch.where(mm_token_type_ids == 1)[1] + hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) + mm_tokens_per_modality = hf_processor._get_num_mm_tokens( + image_inputs=mm_kwargs.get_hf_inputs("image"), + video_inputs=mm_kwargs.get_hf_inputs("video"), + ) + + mm_placeholders = {} + for modality in mm_tokens_per_modality: + split_sizes = mm_tokens_per_modality[modality] + if split_sizes != 0: + chunked_mm_positions = torch.split(mm_positions, split_sizes) + ranges = [ + PlaceholderRange(offset=positions[0].item(), length=positions.shape[0]) + for positions in chunked_mm_positions + ] + mm_placeholders = {modality: ranges} + + return MultiModalInputs( + type="multimodal", + prompt=prompt, + prompt_token_ids=prompt_ids, + mm_kwargs=mm_kwargs, + mm_hashes=None, + mm_placeholders=mm_placeholders, + ) + + +@MULTIMODAL_REGISTRY.register_processor(MultiModalProcessor, + info=MultiModalProcessingInfo, + dummy_inputs=MultiModalDummyInputsBuilder) class TransformersModel(nn.Module, SupportsQuant, SupportsMultiModal): embedding_padding_modules = ["lm_head"] - embedding_modules = ["embed_tokens" - ] # TODO transformers will have a util to get it + embedding_modules = ["embed_tokens"] # TODO transformers will have a util to get it def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: super().__init__() @@ -148,8 +308,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: cache_config = vllm_config.cache_config self.config = config - self.vocab_size = config.get_text_config().vocab_size - self.unpadded_vocab_size = config.get_text_config().vocab_size + self.text_config = config.get_text_config() + self.vocab_size = self.text_config.vocab_size + self.unpadded_vocab_size = self.text_config.vocab_size self.model: PreTrainedModel = AutoModel.from_config( self.config, @@ -166,31 +327,31 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: tp_size = get_tensor_model_parallel_world_size() self.attention_instances 
= [ Attention( - num_heads=divide(config.get_text_config().num_attention_heads, tp_size), - head_size=config.get_text_config().head_dim, + num_heads=divide(self.text_config.num_attention_heads, tp_size), + head_size=self.text_config.head_dim, # NOTE: We use Llama scale as default, if it's set by # Transformers, it's updated in vllm_flash_attention_forward - scale=config.get_text_config().head_dim**-0.5, - num_kv_heads=divide(config.get_text_config().num_key_value_heads, tp_size), + scale=self.text_config.head_dim**-0.5, + num_kv_heads=divide(self.text_config.num_key_value_heads, tp_size), cache_config=cache_config, quant_config=self.quant_config, - prefix=f"{i}.attn") for i in range(config.get_text_config().num_hidden_layers) + prefix=f"{i}.attn") for i in range(self.text_config.num_hidden_layers) ] # Model modifications self.replace_vocab_embed_class(self.model) # ForCausalLM modifications - self.lm_head = ParallelLMHead(config.get_text_config().vocab_size, - config.get_text_config().hidden_size, + self.lm_head = ParallelLMHead(self.text_config.vocab_size, + self.text_config.hidden_size, quant_config=self.quant_config, prefix=maybe_prefix(prefix, "lm_head")) - if config.get_text_config().tie_word_embeddings: + if self.text_config.tie_word_embeddings: self.lm_head.weight = self.model.get_input_embeddings().weight logit_scale = getattr(config, "logit_scale", 1.0) self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, - config.get_text_config().vocab_size, logit_scale) + self.vocab_size, logit_scale) self.sampler = get_sampler() MultiModalRegistry()._get_plugin("image").register_max_multimodal_tokens(576) @@ -201,7 +362,7 @@ def apply_base_model_tp_plan(self, module: nn.Module, prefix: str = ""): Apply the base model tensor parallelization plan to a module. Currently only supports linear layers. """ - if (self.config.get_text_config().base_model_tp_plan is None + if (self.text_config.base_model_tp_plan is None and get_tensor_model_parallel_world_size() > 1): raise ValueError( "Trying to run tensor parallelization but the model does not " @@ -209,7 +370,7 @@ def apply_base_model_tp_plan(self, module: nn.Module, prefix: str = ""): for child_name, child_module in module.named_children(): qual_name = maybe_prefix(prefix, child_name) - for pattern, style in self.config.get_text_config().base_model_tp_plan.items(): + for pattern, style in self.text_config.base_model_tp_plan.items(): if re.match(pattern, qual_name) and isinstance( child_module, nn.Linear): new_module = replace_linear_class(child_module, style, @@ -223,7 +384,7 @@ def replace_vocab_embed_class(self, module: nn.Module): # Use native set input embeddings new_module = VocabParallelEmbedding( self.vocab_size, - self.config.get_text_config().hidden_size, + self.text_config.hidden_size, org_num_embeddings=self.vocab_size, quant_config=None, ) @@ -241,7 +402,8 @@ def forward( inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: model_output = self.model( - input_ids[None, ...], + input_ids[None, ...] if input_ids is not None else None, + inputs_embeds=inputs_embeds[None, ...] 
if inputs_embeds is not None else None, use_cache=False, position_ids=positions[None, ...], attn_metadata=attn_metadata, @@ -271,6 +433,8 @@ def load_weights(self, weights: Iterable[tuple[str, loaded_params = set[str]() for name, loaded_weight in weights: if name not in params_dict: + # In MLLM the head is usually part of the LM so we might want to strip it + # Very bad workaround, needs smth better if "lm_head" in name: name = name.replace("language_model.", "") else: @@ -282,3 +446,43 @@ def load_weights(self, weights: Iterable[tuple[str, weight_loader(param, loaded_weight) loaded_params.add(name) return loaded_params + + def get_multimodal_embeddings(self, **kwargs): + pixel_values = kwargs.pop("pixel_values", None) + image_embeds = kwargs.pop("image_embeds", None) + + if pixel_values is None and image_embeds is None: + return None + + if pixel_values is not None: + vision_embeddings = self.model.get_image_features( + # Thing about pixels being batched again, adding extra dim + # TODO: find out do we really need that extra dim + pixel_values.flatten(0, 1), + vision_feature_layer=self.config.vision_feature_layer, + vision_feature_select_strategy=self.config.vision_feature_select_strategy, + ) + return vision_embeddings + + if image_embeds is not None: + return image_embeds + + def get_input_embeddings( + self, + input_ids: torch.Tensor, + multimodal_embeddings = None, + ) -> torch.Tensor: + inputs_embeds = self.model.get_input_embeddings()(input_ids) + if multimodal_embeddings is not None: + # most supported VLMs merge like this, otherwise we can add a special + # `merge_multimodal_embeddings` method on HF side + mask = (input_ids == self.config.image_token_index) + mask = mask.unsqueeze(-1).expand_as(inputs_embeds) + multimodal_embeddings = torch.cat(multimodal_embeddings) + + # FIXME: The returned multimodal_embeddings must be either a 3D torch.Tensor of shape + # (num_items, feature_size, hidden_size), or a list / tuple of 2D torch.Tensor’s of shape + # (feature_size, hidden_size), so that multimodal_embeddings[i] retrieves the embeddings generated + # from the i-th multimodal data item (e.g, image) of the request. 
+ inputs_embeds = inputs_embeds.masked_scatter(mask, multimodal_embeddings) + return inputs_embeds \ No newline at end of file diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py index e93fa24a6e4..3d12f01fb6f 100644 --- a/vllm/multimodal/inputs.py +++ b/vllm/multimodal/inputs.py @@ -702,6 +702,15 @@ def get_items(self, modality: str) -> Sequence[MultiModalKwargsItem]: self._validate_modality("get_items", modality) return self._items_by_modality[modality] + def get_hf_inputs(self, modality: str) -> dict[str, NestedTensors]: + modality_items = self._items_by_modality.get(modality, None) + hf_inputs = defaultdict[str, list[NestedTensors]](list) + if modality_items is not None: + for mm_kwargs_item in modality_items: + for key, value in mm_kwargs_item.items(): + hf_inputs[key].append(value.data) + hf_inputs = {key: torch.stack(value) for key, value in hf_inputs.items()} + return hf_inputs MultiModalPlaceholderDict = Mapping[str, Sequence[PlaceholderRange]] """ diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py index 268e2dc0b19..c0b2b597f52 100644 --- a/vllm/multimodal/processing.py +++ b/vllm/multimodal/processing.py @@ -867,7 +867,6 @@ def _apply_hf_processor_text_mm( mm_data=processor_data, mm_kwargs=hf_processor_mm_kwargs, ) - print("prompt_text", prompt_text, processed_data.keys()) processed_data.update(passthrough_data) prompt_ids, = processed_data.pop("input_ids").tolist() @@ -1284,7 +1283,7 @@ def apply( for modality, placeholders in mm_placeholders.items() } - print("DONE HERE?") + print("DONE HERE?", mm_placeholder_ranges) return MultiModalInputs( type="multimodal", prompt=prompt, diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index b86099e780e..4a07ef32a4b 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -813,7 +813,7 @@ def _execute_encoder(self, scheduler_output: "SchedulerOutput"): # depending on the input multimodal items. 
curr_group_outputs = self.model.get_multimodal_embeddings( **batched_mm_inputs) - print("curr_group_outputs", curr_group_outputs.shape) + print("curr_group_outputs", curr_group_outputs[0].shape) for output in curr_group_outputs: encoder_outputs.append(output) From e0b534beb59405f76282e3910dca93d39930c450 Mon Sep 17 00:00:00 2001 From: raushan Date: Mon, 24 Feb 2025 10:02:45 +0100 Subject: [PATCH 03/23] clean up --- vllm/inputs/preprocess.py | 1 - vllm/inputs/registry.py | 3 --- vllm/model_executor/models/llama.py | 1 - vllm/model_executor/models/llava.py | 1 - vllm/model_executor/models/transformers.py | 1 - vllm/multimodal/base.py | 1 - vllm/multimodal/processing.py | 19 +++++-------------- vllm/multimodal/registry.py | 2 -- vllm/v1/engine/core.py | 6 +----- vllm/v1/engine/llm_engine.py | 1 - vllm/v1/engine/mm_input_cache.py | 3 --- vllm/v1/engine/processor.py | 4 ---- vllm/v1/worker/gpu_model_runner.py | 3 --- vllm/worker/model_runner.py | 18 ++---------------- 14 files changed, 8 insertions(+), 56 deletions(-) diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py index b2e4866c604..bc5856990da 100644 --- a/vllm/inputs/preprocess.py +++ b/vllm/inputs/preprocess.py @@ -317,7 +317,6 @@ def _prompt_to_llm_inputs( * :class:`SingletonInputs` instance """ parsed = parse_singleton_prompt(prompt) - print("CALLED PROCESSOR", parsed["type"]) if parsed["type"] == "str": prompt_text = parsed["content"] diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py index 2536924a51d..87b7a7631e4 100644 --- a/vllm/inputs/registry.py +++ b/vllm/inputs/registry.py @@ -176,7 +176,6 @@ def call_hf_processor( allow_var_kwargs=True, ) - # print("CTX", data.keys(), merged_kwargs.keys()) try: return hf_processor(**data, **merged_kwargs, return_tensors="pt") except Exception as exc: @@ -388,7 +387,6 @@ def dummy_data_for_profiling( f"Expected at least {num_expected} dummy '{k}' instances " f"for profiling, but found {num_items} instances instead.") - print(dummy_data.multi_modal_data["pixel_values"].shape) return dummy_data def _default_input_processor( @@ -463,7 +461,6 @@ def process_input(self, model_config: "ModelConfig", processor, ) - print("process_input", processor, mm_processor_kwargs.keys()) processed_inputs = processor( InputContext(model_config), inputs, diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index a5fd00b670d..2ff52dd7891 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -375,7 +375,6 @@ def forward( "residual": residual }) - # print(hidden_states.shape) hidden_states, _ = self.norm(hidden_states, residual) return hidden_states diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index a60e0b83a8f..6a4277adb6b 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -530,7 +530,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: hf_config=config.text_config, prefix=maybe_prefix(prefix, "language_model"), ) - print("self.language_model", self.language_model.__class__, self.vision_tower.__class__) self.make_empty_intermediate_tensors = ( self.language_model.make_empty_intermediate_tensors) diff --git a/vllm/model_executor/models/transformers.py b/vllm/model_executor/models/transformers.py index 02442eea845..59a2a1e69b4 100644 --- a/vllm/model_executor/models/transformers.py +++ b/vllm/model_executor/models/transformers.py @@ -230,7 +230,6 @@ def _apply_hf_processor_text_mm( mm_data=processor_data, 
mm_kwargs=hf_processor_mm_kwargs, ) - print("prompt_text", prompt_text, processed_data["pixel_values"][0].shape) processed_data.update(passthrough_data) prompt_ids, = processed_data.pop("input_ids").tolist() diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py index 28abc8b5fe6..c48d07ba365 100644 --- a/vllm/multimodal/base.py +++ b/vllm/multimodal/base.py @@ -219,7 +219,6 @@ def get_max_multimodal_tokens(self, model_config: "ModelConfig") -> int: if not supports_multimodal(model_cls): return 0 - print("_max_mm_tokens", self._max_mm_tokens, model_cls, self.__class__) max_mm_tokens = self._max_mm_tokens.get(model_cls) if max_mm_tokens is None: return 0 diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py index c0b2b597f52..3415beece53 100644 --- a/vllm/multimodal/processing.py +++ b/vllm/multimodal/processing.py @@ -995,7 +995,6 @@ def _cached_apply_hf_processor( _, passthrough_data = self._get_hf_mm_data(mm_data_items) if cache is None or passthrough_data: - print("NO CACHE!") return self._apply_hf_processor_main( prompt=prompt, mm_items=mm_data_items, @@ -1040,7 +1039,6 @@ def _cached_apply_hf_processor( modality: 0 for modality in mm_missing_data_items } - print("CACHED!", mm_missing_idxs) merged_kw_items = list[MultiModalKwargsItem]() for modality, kw_items in mm_maybe_cached_kw_items.items(): @@ -1234,16 +1232,10 @@ def apply( else: mm_hashes = None - # prompt_ids, mm_kwargs, is_repl_applied = self._cached_apply_hf_processor( - # prompt, - # mm_items, - # hf_processor_mm_kwargs, - # ) - prompt_ids, mm_kwargs, is_repl_applied = self._apply_hf_processor_main( - prompt=prompt, - mm_items=mm_items, - hf_processor_mm_kwargs=hf_processor_mm_kwargs, - enable_hf_prompt_replacement=True, + prompt_ids, mm_kwargs, is_repl_applied = self._cached_apply_hf_processor( + prompt, + mm_items, + hf_processor_mm_kwargs, ) unbound_prompt_repls = self._get_prompt_replacements( @@ -1283,13 +1275,12 @@ def apply( for modality, placeholders in mm_placeholders.items() } - print("DONE HERE?", mm_placeholder_ranges) return MultiModalInputs( type="multimodal", prompt=prompt, prompt_token_ids=prompt_ids, mm_kwargs=mm_kwargs, - mm_hashes=None, #mm_hashes, + mm_hashes=mm_hashes, mm_placeholders=mm_placeholder_ranges, ) diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py index cc91e9d0279..613d1db4167 100644 --- a/vllm/multimodal/registry.py +++ b/vllm/multimodal/registry.py @@ -266,7 +266,6 @@ def get_max_tokens_per_item_by_modality( return processor.info.get_mm_max_tokens_per_item( seq_len, mm_limits) - print(self._plugins['image'].get_max_multimodal_tokens(model_config), self._plugins['image'].get_max_multimodal_tokens) return { key: plugin.get_max_multimodal_tokens(model_config) for key, plugin in self._plugins.items() @@ -286,7 +285,6 @@ def get_max_tokens_per_item_by_nonzero_modality( usage of a model. 
""" mm_limits = self.get_mm_limits_per_prompt(model_config) - print("mm_limits", mm_limits) return { key: max_tokens_per_mm_item diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 5ccbbd32bd4..66e252b7ccb 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -129,11 +129,9 @@ def add_request(self, request: EngineCoreRequest): assert request.mm_inputs is not None request.mm_inputs = self.mm_input_cache_server.get_and_update( request.mm_inputs, request.mm_hashes) - # print("request.mm_hashes is None", request.mm_inputs[0] is None) # V1 ADDED HERE IF CACHED from MMInputMapperServer req = Request.from_engine_core_request(request) - # print("self.scheduler.add_request", self.scheduler.add_request) # vllm.v1.core.scheduler.Scheduler.add_request self.scheduler.add_request(req) def abort_requests(self, request_ids: List[str]): @@ -152,9 +150,7 @@ def step(self) -> EngineCoreOutputs: return EngineCoreOutputs( outputs=[], scheduler_stats=self.scheduler.make_stats()) - scheduler_output = self.scheduler.schedule() # kinda allocated new kv cache and updates many internal stats for the requests - # print("scheduler_output", scheduler_output) # DEFI HAS pixel values when V1 is set - # print("self.model_executor", self.model_executor.execute_model) + scheduler_output = self.scheduler.schedule() output = self.model_executor.execute_model(scheduler_output) engine_core_outputs = self.scheduler.update_from_output( scheduler_output, output) # type: ignore diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index c5d03ace732..c9a4c5369df 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -131,7 +131,6 @@ def add_request( ) -> None: # 1) Process raw inputs into the request. - print("CALL add_request", prompt, self.processor) request = self.processor.process_inputs(request_id, prompt, params, arrival_time, lora_request, trace_headers, diff --git a/vllm/v1/engine/mm_input_cache.py b/vllm/v1/engine/mm_input_cache.py index 0aaf264ad31..a1d802bf818 100644 --- a/vllm/v1/engine/mm_input_cache.py +++ b/vllm/v1/engine/mm_input_cache.py @@ -100,7 +100,6 @@ def process_inputs( mm_input = self.mm_cache.get(mm_hash) self.mm_cache_total += 1 - # print("mm_input is None", mm_input is None) if mm_input is None: if precomputed_mm_inputs is not None: # Reuse precomputed input (for merged preprocessor) @@ -119,9 +118,7 @@ def process_inputs( else: self.mm_cache_hits += 1 mm_input = None # Avoids sending mm_input to Server - print(" Avoids sending mm_input to Server, use cache somewhow, I dunno how yet") - # print("mm_input is None", mm_input is None) ret_inputs.append(mm_input) return ret_inputs diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index 1232579b1b4..908204adf72 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -112,8 +112,6 @@ def process_inputs( # multimodal data and expand prompt token ids accordingly. # 3. Apply prompt adapter to prompt token ids if one exists. # Process inputs. - - # CALL input_preprocessor (preprocess.py) where print(add request) preprocessed_inputs = self.input_preprocessor.preprocess( prompt, request_id=request_id, @@ -158,7 +156,6 @@ def process_inputs( # Fallback to using MultiModalHasher directly. 
else: mm_hashes = MultiModalHasher.hash_prompt_mm_data(prompt) - print('HASHING', bool(decoder_inputs.multi_modal_hashes), mm_hashes) # For merged preprocessor, mm_data is already mm_inputs precomputed_mm_inputs: Optional[list[MultiModalKwargs]] = None @@ -191,7 +188,6 @@ def process_inputs( mm_positions, mm_hashes, ) - print("mm_positions", mm_positions) # NOTE: Sort multimodal inputs/kwargs ONLY IF there are multiple # modalities involved AND the model supports merged input processor. diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 4a07ef32a4b..16ec44eefac 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -813,7 +813,6 @@ def _execute_encoder(self, scheduler_output: "SchedulerOutput"): # depending on the input multimodal items. curr_group_outputs = self.model.get_multimodal_embeddings( **batched_mm_inputs) - print("curr_group_outputs", curr_group_outputs[0].shape) for output in curr_group_outputs: encoder_outputs.append(output) @@ -895,7 +894,6 @@ def execute_model( num_input_tokens = num_scheduled_tokens attn_metadata.num_input_tokens = num_input_tokens - print("self.is_multimodal_model", num_scheduled_tokens, num_input_tokens) if self.is_multimodal_model: # NOTE(woosuk): To unify token ids and soft tokens (vision # embeddings), we always use embeddings (rather than token ids) @@ -935,7 +933,6 @@ def execute_model( for k, v in self.intermediate_tensors.items() }) - print("inputs_embeds", inputs_embeds.shape) # THIS IN THE ENTRYPOINT IN V1 # Run the decoder. # Use persistent buffers for CUDA graphs. with set_forward_context(attn_metadata, self.vllm_config): diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index ea05c444efb..67d175c373d 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -542,7 +542,6 @@ def _compute_for_prefix_cache_hit( remaining blocks. """ computed_block_nums = inter_data.computed_block_nums - print("cache hit", computed_block_nums is not None, inter_data.is_prompt) # Note that prefix caching does not support sliding window. prefix_cache_hit = (computed_block_nums is not None @@ -652,12 +651,10 @@ def _compute_prompt_adapter_input( # Note that when is_prompt=True, we expect only one sequence # in the group. if not self.enable_prompt_adapter: - print("no enable_prompt_adapter") return prompt_adapter_id = seq_group_metadata.prompt_adapter_id if prompt_adapter_id <= 0 or not inter_data.is_prompt: - print("no prompt_adapter_id", prompt_adapter_id) return # We expect only one sequence in the group when is_prompt=True. 
@@ -673,7 +670,6 @@ def _compute_prompt_adapter_input( inter_data.prompt_adapter_prompt_mapping = [prompt_adapter_id] * ( query_len if seq_group_metadata.sampling_params and seq_group_metadata.sampling_params.prompt_logprobs else 1) - print("prompt adapters", inter_data.prompt_adapter_index_mapping, inter_data.prompt_adapter_prompt_mapping) def _compute_multi_modal_input(self, inter_data: InterDataForSeqGroup, seq_group_metadata: SequenceGroupMetadata): @@ -687,11 +683,9 @@ def _compute_multi_modal_input(self, inter_data: InterDataForSeqGroup, if not mm_data: return - # print("seq_group_metadata", seq_group_metadata, positions) if self.runner.mm_registry.has_processor(self.runner.model_config): mm_kwargs = mm_data else: - print("RUN INPUT MAPPER AGAIN BUT WHY") mm_kwargs = self.multi_modal_input_mapper( mm_data, seq_group_metadata.mm_processor_kwargs, @@ -699,7 +693,6 @@ def _compute_multi_modal_input(self, inter_data: InterDataForSeqGroup, inter_data.multi_modal_kwargs = mm_kwargs inter_data.multi_modal_placeholder_maps = placeholder_maps - print("placeholder_maps", placeholder_maps["image"].src_ranges, placeholder_maps["image"].dest_ranges) # special processing for mrope position deltas. if self.runner.model_config.uses_mrope: @@ -759,14 +752,12 @@ def add_seq_group(self, seq_group_metadata: SequenceGroupMetadata): encoder_seq_len=encoder_seq_len) self.inter_data_list.append(inter_data) - # print("input_tokens", inter_data.input_tokens) for seq_idx in range(n_seqs): for per_seq_fn in self.per_seq_compute_fns: - per_seq_fn(inter_data, seq_idx, seq_group_metadata) # ADDS PLACEHOLDER HERE I GUESS? + per_seq_fn(inter_data, seq_idx, seq_group_metadata) for per_seq_group_fn in self.per_seq_group_compute_fns: - per_seq_group_fn(inter_data, seq_group_metadata) # ADDS MM KWARGS HERE - # print("inter_data should have mm here!!!", inter_data.multi_modal_kwargs is not None) + per_seq_group_fn(inter_data, seq_group_metadata) def _use_captured_graph(self, batch_size: int, @@ -986,7 +977,6 @@ def build(self) -> ModelInputForGPU: if data.multi_modal_kwargs is not None ] multi_modal_kwargs = MultiModalKwargs.batch(multi_modal_kwargs_list) - # print("building", multi_modal_kwargs.keys()) return self.model_input_cls( input_tokens=input_tokens_tensor, @@ -1728,10 +1718,6 @@ def execute_model( model_forward_end = torch.cuda.Event(enable_timing=True) model_forward_start.record() - if "pixel_values" in multi_modal_kwargs: - print('FINALLY FORWARD', model_input.input_tokens.shape, multi_modal_kwargs["pixel_values"].shape) - else: - print('DECODE', model_input.input_tokens.shape) if not bypass_model_exec: with set_forward_context(model_input.attn_metadata, self.vllm_config, virtual_engine): From 7e8f0d8a0ed0d17696b9a5628915b7dbe3041814 Mon Sep 17 00:00:00 2001 From: raushan Date: Mon, 24 Feb 2025 10:06:16 +0100 Subject: [PATCH 04/23] clean up 2 --- vllm/engine/llm_engine.py | 2 -- vllm/entrypoints/llm.py | 4 ---- vllm/executor/uniproc_executor.py | 1 - vllm/multimodal/processing.py | 6 +++++- vllm/v1/engine/processor.py | 3 +-- vllm/v1/worker/gpu_model_runner.py | 1 - vllm/worker/worker_base.py | 2 -- 7 files changed, 6 insertions(+), 13 deletions(-) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 816baab8c0b..2e5bc75c6db 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -724,7 +724,6 @@ def add_request( if inputs is not None: prompt = inputs assert prompt is not None and params is not None - print("CALL add_request", prompt) if lora_request is not None and not 
self.lora_config: raise ValueError(f"Got lora_request {lora_request} but LoRA is " @@ -755,7 +754,6 @@ def add_request( lora_request=lora_request, prompt_adapter_request=prompt_adapter_request, ) - print("mm_hashes", preprocessed_inputs.get('mm_hashes')) processed_inputs = self.input_processor(preprocessed_inputs) self._add_processed_request( diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index e71ac91e5fc..075ef3e59d8 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -691,10 +691,6 @@ def chat( ] tokenizer = self.get_tokenizer() - model_config = self.llm_engine.processor.input_preprocessor.model_config - mm_processor = self.llm_engine.processor.input_preprocessor.mm_registry.create_processor(model_config, tokenizer) - processor = mm_processor.info.ctx.get_hf_processor() - chat_template = processor.chat_template model_config = self.llm_engine.get_model_config() resolved_content_format = resolve_chat_template_content_format( diff --git a/vllm/executor/uniproc_executor.py b/vllm/executor/uniproc_executor.py index bf693cab5cc..94db232240d 100644 --- a/vllm/executor/uniproc_executor.py +++ b/vllm/executor/uniproc_executor.py @@ -53,7 +53,6 @@ def collective_rpc(self, kwargs: Optional[Dict] = None) -> List[Any]: if kwargs is None: kwargs = {} - # print("self.driver_worker", self.driver_worker, method, getattr(self.driver_worker, method)) answer = run_method(self.driver_worker, method, args, kwargs) return [answer] diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py index 3415beece53..fcd02fbd520 100644 --- a/vllm/multimodal/processing.py +++ b/vllm/multimodal/processing.py @@ -1232,7 +1232,11 @@ def apply( else: mm_hashes = None - prompt_ids, mm_kwargs, is_repl_applied = self._cached_apply_hf_processor( + ( + prompt_ids, + mm_kwargs, + is_repl_applied, + ) = self._cached_apply_hf_processor( prompt, mm_items, hf_processor_mm_kwargs, diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index 908204adf72..b7eee5a3997 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -111,7 +111,6 @@ def process_inputs( # 2. For multimodal models with a merged preprocessor, preprocess # multimodal data and expand prompt token ids accordingly. # 3. Apply prompt adapter to prompt token ids if one exists. - # Process inputs. preprocessed_inputs = self.input_preprocessor.preprocess( prompt, request_id=request_id, @@ -215,7 +214,7 @@ def process_inputs( mm_hashes=sorted_mm_hashes, mm_processor_kwargs=decoder_inputs.mm_processor_kwargs, precomputed_mm_inputs=precomputed_mm_inputs, - ) # THIS ONE REMOVES INPUT IMAGES IF CACHED with MMInputMapperClient + ) else: sorted_mm_inputs = None sorted_mm_hashes = None diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 16ec44eefac..31fe095a91b 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -881,7 +881,6 @@ def execute_model( # Prepare the decoder inputs. 
attn_metadata, logits_indices = self._prepare_inputs(scheduler_output) - num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens if (self.use_cuda_graph and num_scheduled_tokens <= self.cudagraph_batch_sizes[-1]): diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py index 8984ee83636..190429074d5 100644 --- a/vllm/worker/worker_base.py +++ b/vllm/worker/worker_base.py @@ -343,7 +343,6 @@ def _get_driver_input_and_broadcast( execute_model_req.virtual_engine, execute_model_req.finished_requests_ids)) - # print("self.model_runner.prepare_model_input", self.model_runner.prepare_model_input) kwargs = extract_previous_hidden_states(execute_model_req) if self.do_metadata_broadcast: @@ -418,7 +417,6 @@ def execute_model( orig_model_execute_time = intermediate_tensors.tensors.get( "model_execute_time", torch.tensor(0)).item() - # print("self.model_runner.execute_model", self.model_runner.execute_model) output = self.model_runner.execute_model( model_input=model_input, kv_caches=self.kv_cache[worker_input.virtual_engine] From 57c2d85cfae0dcd32c9de46806846e4552cf38d6 Mon Sep 17 00:00:00 2001 From: raushan Date: Mon, 24 Feb 2025 10:28:24 +0100 Subject: [PATCH 05/23] use arbitrary high resolution in dummy inputs --- vllm/model_executor/models/transformers.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/vllm/model_executor/models/transformers.py b/vllm/model_executor/models/transformers.py index 59a2a1e69b4..b1a6fa0d95a 100644 --- a/vllm/model_executor/models/transformers.py +++ b/vllm/model_executor/models/transformers.py @@ -161,9 +161,9 @@ def get_dummy_processor_inputs( video_token = getattr(processor, "video_token", None) # TODO: raushan, we can have processor attr for `processor.max_output_size` which will infer - # max features for model in HF side. IMO should be all done on processor side, not on model config - vision_config = self.info.get_hf_config().vision_config - target_width = target_height = vision_config.image_size + # max features for model in HF side. But imo we can just set a veru high resolution + # and the processor will return us pixels with correct max shape. Resolution 3kx3k is high enough + target_width = target_height = 3000 # NOTE: we can pass videos/images/audio to any processor With the new API used in MLLMs, # HF processor will take the modality needed for model and ignore all others @@ -353,9 +353,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: self.vocab_size, logit_scale) self.sampler = get_sampler() - MultiModalRegistry()._get_plugin("image").register_max_multimodal_tokens(576) - InputRegistry()._dummy_factories_by_model_type[model_cls] = factory - def apply_base_model_tp_plan(self, module: nn.Module, prefix: str = ""): """ Apply the base model tensor parallelization plan to a module. 
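A minimal sketch of the sizing trick introduced in the patch above, kept separate from the patches themselves: instead of hardcoding a per-config image sequence length, pass an arbitrarily large dummy resolution to the HF processor and let it report the resulting image-token count, which then serves as the per-image upper bound for profiling. This assumes a processor that implements `_get_num_multimodal_tokens` (which the later patches in this series rely on); the helper name below is illustrative only, not an existing vLLM or transformers API.

# Sketch under the assumptions stated above, not part of the patch series.
def max_image_tokens_for(processor, mm_processor_kwargs=None) -> int:
    mm_processor_kwargs = mm_processor_kwargs or {}
    # Ask about an arbitrarily large image; the processor clamps/resizes it to the
    # model's real maximum, so the returned count is a safe per-image upper bound.
    height = width = 10_000
    mm_tokens = processor._get_num_multimodal_tokens(
        image_sizes=([height, width],), **mm_processor_kwargs)
    return mm_tokens["num_image_tokens"][0]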
From de54bbfbb640d65b2bd39dbeea31d7c5b86c1900 Mon Sep 17 00:00:00 2001 From: raushan Date: Thu, 27 Mar 2025 16:41:55 +0100 Subject: [PATCH 06/23] tmp --- vllm/model_executor/models/molmo.py | 6 +++++- vllm/model_executor/models/transformers.py | 4 +++- vllm/multimodal/profiling.py | 1 + 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py index b2154ef54af..6e9bda1e075 100644 --- a/vllm/model_executor/models/molmo.py +++ b/vllm/model_executor/models/molmo.py @@ -1626,8 +1626,9 @@ def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]: return None image_features = self._process_image_input(image_input) + print(image_features.shape) - return [ + out = [ self._get_mm_embeds(*args) for args in zip( image_features, image_input["feat_is_patch"], @@ -1635,6 +1636,8 @@ def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]: image_input["embed_is_patch"], ) ] + print(len(out[0]), [o.shape for o in out[0]]) + return out def get_input_embeddings( self, @@ -1646,6 +1649,7 @@ def get_input_embeddings( assert self.img_patch_id is not None # Extract the patch tokens scattered in _get_mm_embeds + print(len(multimodal_embeddings), multimodal_embeddings[0][0].shape) patch_embeddings = json_map_leaves( lambda x: x[~x.isnan()].view(-1, *x.shape[1:]), cast(JSONTree[torch.Tensor], multimodal_embeddings), diff --git a/vllm/model_executor/models/transformers.py b/vllm/model_executor/models/transformers.py index b1a6fa0d95a..78f21b39b9d 100644 --- a/vllm/model_executor/models/transformers.py +++ b/vllm/model_executor/models/transformers.py @@ -204,9 +204,10 @@ def _get_mm_fields_config( ): return dict( pixel_values=MultiModalFieldConfig.batched("image"), + image_sizes=MultiModalFieldConfig.batched("image"), + image_embeds=MultiModalFieldConfig.batched("image"), mm_token_type_ids=MultiModalFieldConfig.batched("image"), pixel_values_videos=MultiModalFieldConfig.batched("video"), - image_embeds=MultiModalFieldConfig.batched("image"), video_embeds=MultiModalFieldConfig.batched("video"), ) @@ -282,6 +283,7 @@ def apply( ] mm_placeholders = {modality: ranges} + print(mm_placeholders) return MultiModalInputs( type="multimodal", prompt=prompt, diff --git a/vllm/multimodal/profiling.py b/vllm/multimodal/profiling.py index 81c92b38f8e..5b5817a7b10 100644 --- a/vllm/multimodal/profiling.py +++ b/vllm/multimodal/profiling.py @@ -157,6 +157,7 @@ def get_dummy_data( info = self.processing_info mm_max_tokens_per_item = info.get_mm_max_tokens_per_item( seq_len, mm_counts) + print(mm_max_tokens_per_item) if mm_counts.keys() != mm_max_tokens_per_item.keys(): raise AssertionError( From 4b4f8b7e9a8a6a4f6d943ab9b21da736c0b5021b Mon Sep 17 00:00:00 2001 From: raushan Date: Wed, 9 Apr 2025 20:27:54 +0200 Subject: [PATCH 07/23] still ugly but works with latest processor update --- vllm/model_executor/model_loader/utils.py | 6 +- vllm/model_executor/models/registry.py | 4 +- vllm/model_executor/models/transformers.py | 89 +++++++++++++--------- vllm/v1/worker/gpu_model_runner.py | 3 + 4 files changed, 62 insertions(+), 40 deletions(-) diff --git a/vllm/model_executor/model_loader/utils.py b/vllm/model_executor/model_loader/utils.py index 15f37aad6d8..1265b921822 100644 --- a/vllm/model_executor/model_loader/utils.py +++ b/vllm/model_executor/model_loader/utils.py @@ -42,7 +42,7 @@ def is_transformers_impl_compatible( def resolve_transformers_arch(model_config: ModelConfig, architectures: list[str]): for i, arch in 
enumerate(architectures): - if arch == "TransformersForCausalLM": + if arch in ["TransformersForCausalLM", "TransformersForMultimodalLM"]: continue auto_map: dict[str, str] = getattr(model_config.hf_config, "auto_map", None) or dict() @@ -66,7 +66,7 @@ def resolve_transformers_arch(model_config: ModelConfig, raise ValueError( f"The Transformers implementation of {arch} is not " "compatible with vLLM.") - architectures[i] = "TransformersForCausalLM" + architectures[i] = "TransformersForMultimodalLM" if model_config.model_impl == ModelImpl.AUTO: if not is_transformers_impl_compatible(arch, custom_model_module): raise ValueError( @@ -77,7 +77,7 @@ def resolve_transformers_arch(model_config: ModelConfig, "%s has no vLLM implementation, falling back to Transformers " "implementation. Some features may not be supported and " "performance may not be optimal.", arch) - architectures[i] = "TransformersForCausalLM" + architectures[i] = "TransformersForMultimodalLM" return architectures diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 43ff892349e..cf619344ea9 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -211,6 +211,7 @@ _TRANSFORMERS_MODELS = { "TransformersForCausalLM": ("transformers", "TransformersForCausalLM"), + "TransformersForMultimodalLM": ("transformers", "TransformersForMultimodalLM"), } # yapf: enable @@ -324,6 +325,7 @@ def _try_load_model_cls( ) -> Optional[Type[nn.Module]]: from vllm.platforms import current_platform current_platform.verify_model_arch(model_arch) + model.load_model_cls() try: return model.load_model_cls() except Exception: @@ -436,7 +438,7 @@ def _normalize_archs( # make sure Transformers backend is put at the last as a fallback if len(normalized_arch) != len(architectures): - normalized_arch.append("TransformersForCausalLM") + normalized_arch.extend(["TransformersForCausalLM", "TransformersForMultimodalLM"]) return normalized_arch def inspect_model_cls( diff --git a/vllm/model_executor/models/transformers.py b/vllm/model_executor/models/transformers.py index 2528f543c42..3b5f32dd4df 100644 --- a/vllm/model_executor/models/transformers.py +++ b/vllm/model_executor/models/transformers.py @@ -126,7 +126,7 @@ def get_supported_mm_limits(self): return {"image": None, "video": None} def get_mm_max_tokens_per_item(self, seq_len, mm_counts): - return {"image": self.get_max_image_tokens(), "video": 100} + return {"image": self.get_max_image_tokens(), "video": 0} def get_max_image_tokens(self) -> int: # Is already an attribute in some VLMs and now reason to make it a required attribute @@ -181,13 +181,26 @@ def get_dummy_processor_inputs( class MultiModalProcessor(BaseMultiModalProcessor): - def _get_prompt_replacements( + def _get_prompt_updates( self, mm_items, hf_processor_mm_kwargs, - out_mm_kwargs: MultiModalKwargs, + out_mm_kwargs, ): - return + """ + Given the original multi-modal items for this modality + and HF-processed data, output the updates to perform. + + The information returned by this method is used to update token inputs + which bypass the HF processor. It is also used to update the output of + HF processor if the HF process does not apply prompt updates to text + inputs. + + Moreover, this information is critical to determine the token positions + in order to construct :class:`~vllm-multimodal.input.PlaceholderRange` + for each multi-modal item. 
+ """ + return None def _get_mm_fields_config( self, @@ -240,6 +253,7 @@ def apply( prompt, mm_data, hf_processor_mm_kwargs, + return_mm_hashes = False, ) -> MultiModalInputs: """ Process multi-modal inputs to be used in vLLM. @@ -247,6 +261,12 @@ def apply( Apply HF Processor on prompt text and multi-modal data together, outputting token IDs and processed tensors. """ + if return_mm_hashes: + raise ValueError( + "TransformersMultimodalLM doesn't support mm hashing yet! Probably you did not set " + "`enable_prefix_caching=False` and `enable_chunked_prefill=False`." + ) + mm_items = self._to_mm_items(mm_data) prompt_ids, mm_kwargs, mm_token_type_ids = self._apply_hf_processor_text_mm( prompt_text=prompt, @@ -269,13 +289,14 @@ def apply( split_sizes = mm_tokens_per_modality[modality] if split_sizes != 0: chunked_mm_positions = torch.split(mm_positions, split_sizes) + mm_tokens = torch.tensor(prompt_ids)[mm_token_type_ids[0].bool()] + is_embed = (mm_tokens == hf_processor.image_token_id).bool() ranges = [ - PlaceholderRange(offset=positions[0].item(), length=positions.shape[0]) + PlaceholderRange(offset=positions[0].item(), length=positions.shape[0], is_embed=is_embed) for positions in chunked_mm_positions ] mm_placeholders = {modality: ranges} - print(mm_placeholders) return MultiModalInputs( type="multimodal", prompt=prompt, @@ -319,7 +340,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): # weights mapper to rename weights. self.model: PreTrainedModel = AutoModel.from_config( config, - attn_implementation={"text_config": "vllm", "vision_config": "sdpa"}, + attn_implementation={"text_config": "vllm", "vision_config": "eager"}, torch_dtype=model_config.dtype, trust_remote_code=model_config.trust_remote_code, ) @@ -341,12 +362,12 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): # Attention layers self.attention_instances = self.create_attention_instances() + # Move meta tensors to device (should happen last) + self.meta_to_empty(self.model) + # Initialize buffers (e.g. 
rotary embedding inverse frequency) self.init_buffers(self.model) - # Move remaining meta tensors to device (should happen last) - self.meta_to_empty(self.model) - self.make_empty_intermediate_tensors = ( make_empty_intermediate_tensors_factory(["hidden_states"], text_config.hidden_size)) @@ -470,15 +491,19 @@ def init_buffers(self, module: nn.Module): - This class is constructed using a `PretrainedConfig` """ for name, buffer in module.named_buffers(recurse=False): - if buffer.device == torch.device("meta"): - new_buffer = getattr(type(module)(self.config), name) - setattr(module, name, new_buffer) + if module.__class__.__name__.startswith("Pixtral") or module.__class__.__name__.startswith("CLIP"): + config = self.config.vision_config + else: + config = self.config.text_config + new_buffer = getattr(type(module)(config), name) + setattr(module, name, new_buffer) for child in module.children(): self.init_buffers(child) def meta_to_empty(self, module: nn.Module): + names = [name for name, _ in module.named_buffers()] + [name for name, _ in module.named_parameters()] tensors = list(chain(module.buffers(), module.parameters())) - if tensors and all(t.device == torch.device("meta") for t in tensors): + if tensors and any(t.device == torch.device("meta") for t in tensors): module.to_empty(device=self.device_config.device) return # We can stop recursing because to_empty is recursive for child in module.children(): @@ -675,11 +700,12 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): @property def hf_to_vllm_mapper(self): prefix_mapper = { - name: "model." + name - for name, _ in self.model.model.named_children() + "language_model.model": "model.language_model", + "vision_tower": "model.vision_tower", + "multi_modal_projector": "model.multi_modal_projector", + "language_model.lm_head": "lm_head", } return WeightsMapper( - orig_to_new_substr={"model.": "model.model."}, orig_to_new_prefix=prefix_mapper, ) @@ -711,23 +737,12 @@ def sample(self, logits: torch.Tensor, def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: - params_dict = dict(self.named_parameters()) - loaded_params = set[str]() - for name, loaded_weight in weights: - if name not in params_dict: - # In MLLM the head is usually part of the LM so we might want to strip it - # Very bad workaround, needs smth better - if "lm_head" in name: - name = name.replace("language_model.", "") - else: - name = f"{self.model.base_model_prefix}.{name}" - if name in params_dict: - param = params_dict[name] - weight_loader = getattr(param, "weight_loader", - default_weight_loader) - weight_loader(param, loaded_weight) - loaded_params.add(name) - return loaded_params + loader = AutoWeightsLoader( + self, + skip_prefixes=(["lm_head."] + if self.config.get_text_config().tie_word_embeddings else None), + ) + return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) def get_multimodal_embeddings(self, **kwargs): pixel_values = kwargs.pop("pixel_values", None) @@ -737,12 +752,14 @@ def get_multimodal_embeddings(self, **kwargs): return None if pixel_values is not None: - vision_embeddings = self.model.get_image_features( + pixel_values = pixel_values.to(torch.float16) + vision_embeddings = self.model.model.get_image_features( # Thing about pixels being batched again, adding extra dim # TODO: find out do we really need that extra dim pixel_values.flatten(0, 1), vision_feature_layer=self.config.vision_feature_layer, vision_feature_select_strategy=self.config.vision_feature_select_strategy, + **{k: 
v.flatten(0, 1) for k, v in kwargs.items()}, ) return vision_embeddings @@ -754,7 +771,7 @@ def get_input_embeddings( input_ids: torch.Tensor, multimodal_embeddings = None, ) -> torch.Tensor: - inputs_embeds = self.model.get_input_embeddings()(input_ids) + inputs_embeds = self.model.model.get_input_embeddings()(input_ids) if multimodal_embeddings is not None: # most supported VLMs merge like this, otherwise we can add a special # `merge_multimodal_embeddings` method on HF side diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index a83409a72a8..4b51c694d8b 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -893,6 +893,7 @@ def _execute_mm_encoder(self, scheduler_output: "SchedulerOutput"): output, is_embed=pos_info.is_embed, ) + print('execute encoder', self.encoder_cache[req_id][input_id].shape) def _gather_mm_embeddings( self, @@ -932,6 +933,8 @@ def _gather_mm_embeddings( if (is_embed := pos_info.is_embed) is not None: is_embed = is_embed[start_idx:end_idx] + + print(start_idx, end_idx, num_encoder_tokens, num_scheduled_tokens, pos_info.is_embed.shape) mm_embeds_item = gather_mm_placeholders( encoder_output[start_idx:end_idx], From c5aac3efdb993aa6e62bc67c590ed6afefc26a34 Mon Sep 17 00:00:00 2001 From: raushan Date: Wed, 21 May 2025 13:24:26 +0200 Subject: [PATCH 08/23] update --- vllm/config.py | 4 +- vllm/model_executor/models/registry.py | 8 +- vllm/model_executor/models/transformers.py | 221 +++++++++++++-------- vllm/multimodal/profiling.py | 12 +- vllm/v1/worker/gpu_model_runner.py | 3 - 5 files changed, 150 insertions(+), 98 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index 439e27b154a..1a20910541c 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -398,7 +398,7 @@ def __init__( # only the attention layer itself is aware of the sliding # window, and use the window size to compute the attention. self.hf_text_config.interleaved_sliding_window = sliding_window - delattr(self.hf_text_config, "sliding_window") + # delattr(self.hf_text_config, "sliding_window") sliding_window = None self.max_model_len = _get_and_verify_max_len( @@ -865,7 +865,7 @@ def get_head_size(self) -> int: if self.is_attention_free: return 0 - if hasattr(self.hf_text_config, "head_dim"): + if hasattr(self.hf_text_config, "head_dim") and self.hf_text_config.head_dim is not None: return self.hf_text_config.head_dim # FIXME(woosuk): This may not be true for all models. 
return (self.hf_text_config.hidden_size // diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index cf619344ea9..a34ed16d927 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -174,6 +174,7 @@ "GLM4VForCausalLM": ("glm4v", "GLM4VForCausalLM"), "H2OVLChatModel": ("h2ovl", "H2OVLChatModel"), "InternVLChatModel": ("internvl", "InternVLChatModel"), + # "InternVLForConditionalGeneration": ("internvl", "InternVLForConditionalGeneration"), # noqa: E501 "Idefics3ForConditionalGeneration":("idefics3","Idefics3ForConditionalGeneration"), "LlavaForConditionalGeneration": ("llava", "LlavaForConditionalGeneration"), "LlavaNextForConditionalGeneration": ("llava_next", "LlavaNextForConditionalGeneration"), # noqa: E501 @@ -210,8 +211,8 @@ } _TRANSFORMERS_MODELS = { - "TransformersForCausalLM": ("transformers", "TransformersForCausalLM"), "TransformersForMultimodalLM": ("transformers", "TransformersForMultimodalLM"), + "TransformersForCausalLM": ("transformers", "TransformersForCausalLM"), } # yapf: enable @@ -438,7 +439,9 @@ def _normalize_archs( # make sure Transformers backend is put at the last as a fallback if len(normalized_arch) != len(architectures): - normalized_arch.extend(["TransformersForCausalLM", "TransformersForMultimodalLM"]) + # The order matters. If causal comes first, checks on MM model fails because it is not registered in MultimodalRegistry + # TODO: needs help from vLLM team + normalized_arch.extend(["TransformersForMultimodalLM", "TransformersForCausalLM"]) return normalized_arch def inspect_model_cls( @@ -446,7 +449,6 @@ def inspect_model_cls( architectures: Union[str, List[str]], ) -> Tuple[_ModelInfo, str]: architectures = self._normalize_archs(architectures) - for arch in architectures: model_info = self._try_inspect_model_cls(arch) if model_info is not None: diff --git a/vllm/model_executor/models/transformers.py b/vllm/model_executor/models/transformers.py index 3b5f32dd4df..655feef7c1b 100644 --- a/vllm/model_executor/models/transformers.py +++ b/vllm/model_executor/models/transformers.py @@ -16,6 +16,7 @@ """Wrapper around `transformers` models""" import re from itertools import chain +from contextlib import contextmanager from typing import Iterable, Literal, Optional, Union import torch @@ -46,6 +47,7 @@ from vllm.multimodal.processing import BaseMultiModalProcessor, BaseProcessingInfo from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs from vllm.multimodal.inputs import MultiModalFieldConfig, MultiModalInputs, PlaceholderRange +from vllm.multimodal.parse import ImageProcessorItems, ImageSize from .interfaces import SupportsLoRA, SupportsPP, SupportsQuant, SupportsMultiModal from .utils import (AutoWeightsLoader, PPMissingLayer, WeightsMapper, @@ -116,10 +118,51 @@ def replace_linear_class( ) +# Copied from `accelerate` +@contextmanager +def init_on_device_without_buffers(device: torch.device): + """ + A context manager under which models are initialized with all parameters on the specified device. + However buffers are not initalized on specified device. + + Args: + device (`torch.device`): + Device to initialize all parameters on. 
+ """ + + old_register_parameter = nn.Module.register_parameter + + def register_empty_parameter(module, name, param): + old_register_parameter(module, name, param) + if param is not None: + param_cls = type(module._parameters[name]) + kwargs = module._parameters[name].__dict__ + kwargs["requires_grad"] = param.requires_grad + module._parameters[name] = param_cls(module._parameters[name].to(device), **kwargs) + + tensor_constructors_to_patch = {} + + def patch_tensor_constructor(fn): + def wrapper(*args, **kwargs): + kwargs["device"] = device + return fn(*args, **kwargs) + return wrapper + + try: + nn.Module.register_parameter = register_empty_parameter + for torch_function_name in tensor_constructors_to_patch.keys(): + setattr(torch, torch_function_name, patch_tensor_constructor(getattr(torch, torch_function_name))) + yield + finally: + nn.Module.register_parameter = old_register_parameter + for torch_function_name, old_torch_function in tensor_constructors_to_patch.items(): + setattr(torch, torch_function_name, old_torch_function) + + class MultiModalProcessingInfo(BaseProcessingInfo): def get_hf_config(self): # NOTE: this means we don't check if return config type is same as requested - # VLLM on contrary always checks. In whcih cases we can have different config types tho? + # vLLM on contrary always checks. In which cases we can have different config types tho? return self.ctx.model_config.hf_config def get_supported_mm_limits(self): @@ -129,14 +172,20 @@ def get_mm_max_tokens_per_item(self, seq_len, mm_counts): return {"image": self.get_max_image_tokens(), "video": 0} def get_max_image_tokens(self) -> int: - # Is already an attribute in some VLMs and now reason to make it a required attribute - # TODO: @raushan add it for all VLM configs - return self.get_hf_config().image_seq_length + width, height = self.get_max_image_size() + processor = self.get_hf_processor() + mm_processor_kwargs = self.ctx.model_config.mm_processor_kwargs or {} + mm_tokens = processor._get_num_multimodal_tokens(image_sizes=([height, width],), **mm_processor_kwargs) + image_tokens = mm_tokens["num_image_tokens"][0] + return image_tokens def get_hf_processor(self): processor = cached_get_processor(self.ctx.model_config.model) return processor + def get_max_image_size(self): + return 10_000, 10_000 # hardcode for arbitrary very large size + class MultiModalDummyInputsBuilder(BaseDummyInputsBuilder): def get_dummy_processor_inputs( @@ -151,11 +200,7 @@ def get_dummy_processor_inputs( processor = self.info.get_hf_processor() image_token = getattr(processor, "image_token", None) video_token = getattr(processor, "video_token", None) - - # TODO: raushan, we can have processor attr for `processor.max_output_size` which will infer - # max features for model in HF side. But imo we can just set a veru high resolution - # and the processor will return us pixels with correct max shape. 
Resolution 3kx3k is high enough - target_width = target_height = 3000 + target_width, target_height = self.info.get_max_image_size() # NOTE: we can pass videos/images/audio to any processor With the new API used in MLLMs, # HF processor will take the modality needed for model and ignore all others @@ -163,7 +208,7 @@ def get_dummy_processor_inputs( "image": self._get_dummy_images( width=target_width, height=target_height, - num_images=num_images + num_images=1 ), "video": self._get_dummy_videos( width=target_width, @@ -173,7 +218,7 @@ def get_dummy_processor_inputs( ) } - prompt_text = video_token*num_videos if video_token is not None else image_token*num_images + prompt_text = video_token*num_videos if num_videos else image_token*num_images return ProcessorInputs( prompt_text=prompt_text, mm_data=mm_data, @@ -206,15 +251,26 @@ def _get_mm_fields_config( self, hf_inputs, hf_processor_mm_kwargs, + num_image_patches: torch.Tensor = None, ): - return dict( - pixel_values=MultiModalFieldConfig.batched("image"), - image_sizes=MultiModalFieldConfig.batched("image"), - image_embeds=MultiModalFieldConfig.batched("image"), - mm_token_type_ids=MultiModalFieldConfig.batched("image"), - pixel_values_videos=MultiModalFieldConfig.batched("video"), - video_embeds=MultiModalFieldConfig.batched("video"), - ) + # NOTE from `transformers`: we are planning on refactoring image processors to return same format as output + # The final solution would be either to converge as Idefics-style or used a nested tensor + if False and "image_grid_thw" in hf_inputs: # Qwen-style model + image_grid_thw = hf_inputs["image_grid_thw"] + image_grid_sizes = image_grid_thw.prod(-1) + mm_fields = { + "image_grid_thw" : MultiModalFieldConfig.batched("image"), + "image_embeds": MultiModalFieldConfig.flat_from_sizes("image", image_grid_sizes), + "pixel_values": MultiModalFieldConfig.flat_from_sizes("image", image_grid_sizes), + } + else: + hf_inputs.pop("attention_mask", None) + size_per_item = num_image_patches if num_image_patches is not None else torch.tensor([1] * len(hf_inputs["pixel_values"])) + mm_fields = {key: MultiModalFieldConfig.flat_from_sizes("image", size_per_item) for key in hf_inputs.keys() if "video" not in key} + mm_fields["image_embeds"] = MultiModalFieldConfig.flat_from_sizes("image", size_per_item) + # mm_fields = {key: MultiModalFieldConfig.batched("image") for key in hf_inputs.keys() if "video" not in key} + # mm_fields["image_embeds"] = MultiModalFieldConfig.batched("image") + return mm_fields def _apply_hf_processor_text_mm( self, @@ -238,15 +294,11 @@ def _apply_hf_processor_text_mm( ) processed_data.update(passthrough_data) + print("prompt_text", prompt_text, processed_data['input_ids'].shape, processed_data["mm_token_type_ids"].shape, processed_data['pixel_values'].shape) prompt_ids, = processed_data.pop("input_ids").tolist() mm_token_type_ids = processed_data.pop("mm_token_type_ids") - mm_kwargs = MultiModalKwargs.from_hf_inputs( - processed_data, - self._get_mm_fields_config(processed_data, hf_processor_mm_kwargs), - ) - - return prompt_ids, mm_kwargs, mm_token_type_ids + return prompt_ids, processed_data, mm_token_type_ids def apply( self, @@ -264,11 +316,13 @@ def apply( if return_mm_hashes: raise ValueError( "TransformersMultimodalLM doesn't support mm hashing yet! Probably you did not set " - "`enable_prefix_caching=False` and `enable_chunked_prefill=False`." + "`disable_mm_preprocessor_cache=True`." 
) mm_items = self._to_mm_items(mm_data) - prompt_ids, mm_kwargs, mm_token_type_ids = self._apply_hf_processor_text_mm( + hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) + + prompt_ids, processed_data, mm_token_type_ids = self._apply_hf_processor_text_mm( prompt_text=prompt, mm_items=mm_items, hf_processor_mm_kwargs=hf_processor_mm_kwargs, @@ -278,24 +332,34 @@ def apply( # we can infer mm_placeholders. Until then hardcode to make code run # Below tested on Llava. Prompts and `mm_token_type_ids` are always bs=1 mm_positions = torch.where(mm_token_type_ids == 1)[1] - hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) - mm_tokens_per_modality = hf_processor._get_num_mm_tokens( - image_inputs=mm_kwargs.get_hf_inputs("image"), - video_inputs=mm_kwargs.get_hf_inputs("video"), - ) + images = mm_items.get_items("image", ImageProcessorItems) + mm_processor_kwargs = self.info.ctx.model_config.mm_processor_kwargs or {} + image_sizes = [] + for item_idx in range(len(images)): + image_size = images.get_image_size(item_idx) + image_sizes.append((image_size.height, image_size.width)) + + mm_tokens_per_modality = hf_processor._get_num_multimodal_tokens(image_sizes=image_sizes, **mm_processor_kwargs) + print("num_tokens", mm_token_type_ids.shape, mm_positions.shape, image_sizes, mm_tokens_per_modality, mm_processor_kwargs.keys()) mm_placeholders = {} - for modality in mm_tokens_per_modality: - split_sizes = mm_tokens_per_modality[modality] - if split_sizes != 0: - chunked_mm_positions = torch.split(mm_positions, split_sizes) - mm_tokens = torch.tensor(prompt_ids)[mm_token_type_ids[0].bool()] - is_embed = (mm_tokens == hf_processor.image_token_id).bool() - ranges = [ - PlaceholderRange(offset=positions[0].item(), length=positions.shape[0], is_embed=is_embed) - for positions in chunked_mm_positions - ] - mm_placeholders = {modality: ranges} + split_sizes = mm_tokens_per_modality["num_image_tokens"] + if split_sizes: + chunked_mm_positions = torch.split(mm_positions, split_sizes) + mm_tokens = torch.tensor(prompt_ids)[mm_token_type_ids[0].bool()] + chunked_mm_tokens = torch.split(mm_tokens, split_sizes) + print("Is embed", (mm_tokens == hf_processor.image_token_id).sum(-1)) + ranges = [ + PlaceholderRange(offset=positions[0].item(), length=positions.shape[0], is_embed=(mm_tokens == hf_processor.image_token_id).bool()) + for positions, mm_tokens in zip(chunked_mm_positions, chunked_mm_tokens) + ] + mm_placeholders = {"image": ranges} + + num_image_patches = torch.tensor(mm_tokens_per_modality["num_image_patches"]) if "num_image_patches" in mm_tokens_per_modality else None + mm_kwargs = MultiModalKwargs.from_hf_inputs( + processed_data, + self._get_mm_fields_config(processed_data, hf_processor_mm_kwargs, num_image_patches), + ) return MultiModalInputs( type="multimodal", @@ -334,7 +398,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.tp_size = get_tensor_model_parallel_world_size() # Use meta device to delay allocating GPU tensors - with torch.device("meta"): + with init_on_device_without_buffers("meta"): # FIXME(Isotr0py): We need to refactor this part in the future to # avoid registering an extra model layer, otherwise we will need a # weights mapper to rename weights. @@ -365,9 +429,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): # Move meta tensors to device (should happen last) self.meta_to_empty(self.model) - # Initialize buffers (e.g. 
rotary embedding inverse frequency) - self.init_buffers(self.model) - self.make_empty_intermediate_tensors = ( make_empty_intermediate_tensors_factory(["hidden_states"], text_config.hidden_size)) @@ -476,36 +537,12 @@ def create_attention_instances(self) -> dict[int, Attention]: for i in range(start, end) } - def init_buffers(self, module: nn.Module): - """ - If a `buffer` is on the `meta` device, then its parent - `module` is the original module created by: - - ```python - with torch.device("meta"): - self.model: PreTrainedModel = AutoModel.from_config(...) - ``` - - This means that: - - `type(module)` is a class from `transformers` - - This class is constructed using a `PretrainedConfig` - """ - for name, buffer in module.named_buffers(recurse=False): - if module.__class__.__name__.startswith("Pixtral") or module.__class__.__name__.startswith("CLIP"): - config = self.config.vision_config - else: - config = self.config.text_config - new_buffer = getattr(type(module)(config), name) - setattr(module, name, new_buffer) - for child in module.children(): - self.init_buffers(child) - def meta_to_empty(self, module: nn.Module): - names = [name for name, _ in module.named_buffers()] + [name for name, _ in module.named_parameters()] - tensors = list(chain(module.buffers(), module.parameters())) - if tensors and any(t.device == torch.device("meta") for t in tensors): - module.to_empty(device=self.device_config.device) - return # We can stop recursing because to_empty is recursive + for name, param in module.named_parameters(recurse=False): + if param.device == torch.device("meta"): + new_param = torch.empty_like(param, device=self.device_config.device) + new_param = type(param)(new_param) + module._parameters[name] = new_param for child in module.children(): self.meta_to_empty(child) @@ -529,11 +566,16 @@ def forward( if inputs_embeds is not None: inputs_embeds = inputs_embeds[None, ...] + if self.model_config.uses_mrope: + positions = positions[:, None] + else: + positions = positions[None, ...] + hidden_states = self.model( input_ids=input_ids, inputs_embeds=inputs_embeds, use_cache=False, - position_ids=positions[None, ...], + position_ids=positions, attention_instances=self.attention_instances, return_dict=False)[0][0, ...] # we remove batch dimension for now @@ -666,6 +708,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): quant_config: QuantizationConfig = vllm_config.quant_config self.config = config + self.dtype = vllm_config.model_config.dtype self.model = TransformersModel(vllm_config=vllm_config, prefix=prefix) text_config = config.get_text_config() @@ -694,17 +737,24 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) - # FIXME(Isotr0py): Don't use any weights mapper for Transformers backend, - # this makes thing complicated. We need to remove this mapper after refactor - # `TransformersModel` in the future. 
@property def hf_to_vllm_mapper(self): prefix_mapper = { "language_model.model": "model.language_model", + "text_model.model": "model.text_model", "vision_tower": "model.vision_tower", + "image_newline": "model.image_newline", + "vqmodel": "model.vqmodel", + "vision_model": "model.vision_model", "multi_modal_projector": "model.multi_modal_projector", + "text_model.lm_head": "lm_head", "language_model.lm_head": "lm_head", } + # Don't change the order for QwenVL + if 'Qwen2' in self.config.__class__.__name__: + prefix_mapper["model"] = "model.language_model" + prefix_mapper["visual"] = "model.visual" + return WeightsMapper( orig_to_new_prefix=prefix_mapper, ) @@ -752,15 +802,17 @@ def get_multimodal_embeddings(self, **kwargs): return None if pixel_values is not None: - pixel_values = pixel_values.to(torch.float16) + pixel_values = pixel_values.to(self.dtype) vision_embeddings = self.model.model.get_image_features( # Thing about pixels being batched again, adding extra dim # TODO: find out do we really need that extra dim pixel_values.flatten(0, 1), - vision_feature_layer=self.config.vision_feature_layer, - vision_feature_select_strategy=self.config.vision_feature_select_strategy, **{k: v.flatten(0, 1) for k, v in kwargs.items()}, ) + if isinstance(vision_embeddings, torch.Tensor): + print("vision_embeddings", vision_embeddings.shape, pixel_values.shape) + # TODO: fix pixtral to return output of shape [bs, seq-len, dim] + # vision_embeddings = vision_embeddings.reshape(pixel_values.shape[0], -1, 5120) return vision_embeddings if image_embeds is not None: @@ -775,7 +827,7 @@ def get_input_embeddings( if multimodal_embeddings is not None: # most supported VLMs merge like this, otherwise we can add a special # `merge_multimodal_embeddings` method on HF side - mask = (input_ids == self.config.image_token_index) + mask = (input_ids == self.config.image_token_id) mask = mask.unsqueeze(-1).expand_as(inputs_embeds) multimodal_embeddings = torch.cat(multimodal_embeddings) @@ -783,5 +835,6 @@ def get_input_embeddings( # (num_items, feature_size, hidden_size), or a list / tuple of 2D torch.Tensor’s of shape # (feature_size, hidden_size), so that multimodal_embeddings[i] retrieves the embeddings generated # from the i-th multimodal data item (e.g, image) of the request. 
+ print(mask[..., 0].sum(-1), multimodal_embeddings.shape) inputs_embeds = inputs_embeds.masked_scatter(mask, multimodal_embeddings) return inputs_embeds \ No newline at end of file diff --git a/vllm/multimodal/profiling.py b/vllm/multimodal/profiling.py index 4f418d18c8c..78c81adb125 100644 --- a/vllm/multimodal/profiling.py +++ b/vllm/multimodal/profiling.py @@ -189,12 +189,12 @@ def get_and_validate_mm_inputs( modality: mm_max_tokens_per_item[modality] * mm_counts[modality] for modality in placeholders_by_modality } - if total_placeholders_by_modality != expected_placeholders_by_modality: - raise AssertionError( - f"The processed dummy data has a total of " - f"{total_placeholders_by_modality} placeholder tokens, which " - f"is not the expected {expected_placeholders_by_modality} " - "tokens.") + # if total_placeholders_by_modality != expected_placeholders_by_modality: + # raise AssertionError( + # f"The processed dummy data has a total of " + # f"{total_placeholders_by_modality} placeholder tokens, which " + # f"is not the expected {expected_placeholders_by_modality} " + # "tokens.") return mm_inputs, total_placeholders_by_modality def get_encoder_dummy_data( diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 4b51c694d8b..a83409a72a8 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -893,7 +893,6 @@ def _execute_mm_encoder(self, scheduler_output: "SchedulerOutput"): output, is_embed=pos_info.is_embed, ) - print('execute encoder', self.encoder_cache[req_id][input_id].shape) def _gather_mm_embeddings( self, @@ -933,8 +932,6 @@ def _gather_mm_embeddings( if (is_embed := pos_info.is_embed) is not None: is_embed = is_embed[start_idx:end_idx] - - print(start_idx, end_idx, num_encoder_tokens, num_scheduled_tokens, pos_info.is_embed.shape) mm_embeds_item = gather_mm_placeholders( encoder_output[start_idx:end_idx], From 60300c47809be208e2cb76bdd7de14bcfbb41182 Mon Sep 17 00:00:00 2001 From: raushan Date: Wed, 21 May 2025 16:08:50 +0200 Subject: [PATCH 09/23] fix issues --- vllm/model_executor/models/transformers.py | 12 ++---------- vllm/v1/engine/mm_input_cache.py | 3 +-- 2 files changed, 3 insertions(+), 12 deletions(-) diff --git a/vllm/model_executor/models/transformers.py b/vllm/model_executor/models/transformers.py index 5e787e47930..b7b105e2d34 100644 --- a/vllm/model_executor/models/transformers.py +++ b/vllm/model_executor/models/transformers.py @@ -218,7 +218,7 @@ def get_dummy_processor_inputs( ) } - prompt_text = video_token*num_videos if num_videos else image_token*num_images + prompt_text = image_token*num_images return ProcessorInputs( prompt_text=prompt_text, mm_data=mm_data, @@ -544,7 +544,7 @@ def meta_to_empty(self, module: nn.Module): new_param = type(param)(new_param) module._parameters[name] = new_param for child in module.children(): - self.init_parameters(child) + self.meta_to_empty(child) def get_input_embeddings(self) -> nn.Module: return self.model.get_input_embeddings() @@ -725,8 +725,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): else: self.lm_head = PPMissingLayer() - self.sampler = get_sampler() - self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) @@ -772,12 +770,6 @@ def compute_logits( sampling_metadata) return logits - def sample(self, logits: torch.Tensor, - sampling_metadata: SamplingMetadata) -> Optional[SamplerOutput]: - - next_tokens = self.sampler(logits, sampling_metadata) - return next_tokens - def 
load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader( diff --git a/vllm/v1/engine/mm_input_cache.py b/vllm/v1/engine/mm_input_cache.py index fcb90bebdb6..3631f4d59fa 100644 --- a/vllm/v1/engine/mm_input_cache.py +++ b/vllm/v1/engine/mm_input_cache.py @@ -34,8 +34,7 @@ class MirroredProcessingCache: def __init__(self, model_config): mm_config = model_config.multimodal_config - disable_mm_preprocessor_cache = mm_config is not None and \ - not mm_config.disable_mm_preprocessor_cache + disable_mm_preprocessor_cache = mm_config is not None and mm_config.disable_mm_preprocessor_cache self.use_cache = not disable_mm_preprocessor_cache self.mm_cache = ProcessingCache.get_lru_cache(VLLM_MM_INPUT_CACHE_GIB, MultiModalKwargs) From 0c69adebe90586cf1e3d9b86ce2ba755c85c5b96 Mon Sep 17 00:00:00 2001 From: raushan Date: Thu, 29 May 2025 13:50:50 +0200 Subject: [PATCH 10/23] update --- vllm/config.py | 2 +- vllm/model_executor/models/registry.py | 1 - vllm/model_executor/models/transformers.py | 90 +++++++++------------- 3 files changed, 38 insertions(+), 55 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index b510419d7e2..3fa1db0e839 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -562,7 +562,7 @@ def __post_init__(self) -> None: # only the attention layer itself is aware of the sliding # window, and use the window size to compute the attention. self.hf_text_config.interleaved_sliding_window = sliding_window - # delattr(self.hf_text_config, "sliding_window") + delattr(self.hf_text_config, "sliding_window") sliding_window = None self.max_model_len = _get_and_verify_max_len( diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index fd70ca6f501..037a85a9478 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -345,7 +345,6 @@ def _try_load_model_cls( ) -> Optional[type[nn.Module]]: from vllm.platforms import current_platform current_platform.verify_model_arch(model_arch) - model.load_model_cls() try: return model.load_model_cls() except Exception: diff --git a/vllm/model_executor/models/transformers.py b/vllm/model_executor/models/transformers.py index b7b105e2d34..f8d300880f6 100644 --- a/vllm/model_executor/models/transformers.py +++ b/vllm/model_executor/models/transformers.py @@ -22,7 +22,7 @@ import torch from torch import nn -from transformers import AutoModel, PretrainedConfig, PreTrainedModel, LlavaConfig +from transformers import AutoModel, PretrainedConfig, PreTrainedModel from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS from vllm.attention import Attention @@ -161,8 +161,6 @@ def wrapper(*args, **kwargs): class MultiModalProcessingInfo(BaseProcessingInfo): def get_hf_config(self): - # NOTE: this means we don't check if return config type is same as requested - # vLLM on contrary always checks. In which cases we can have different config types tho? 
return self.ctx.model_config.hf_config def get_supported_mm_limits(self): @@ -194,28 +192,20 @@ def get_dummy_processor_inputs( mm_counts, ) -> ProcessorInputs: num_images = mm_counts.get("image", 0) - num_videos = mm_counts.get("video", 0) - num_frames = 8 processor = self.info.get_hf_processor() - image_token = getattr(processor, "image_token", None) - video_token = getattr(processor, "video_token", None) + if "gemma3" in processor.__class__.__name__.lower(): + image_token = getattr(processor, "boi_token") + else: + image_token = getattr(processor, "image_token", "") target_width, target_height = self.info.get_max_image_size() - # NOTE: we can pass videos/images/audio to any processor With the new API used in MLLMs, - # HF processor will take the modality needed for model and ignore all others mm_data = { "image": self._get_dummy_images( width=target_width, height=target_height, num_images=1 ), - "video": self._get_dummy_videos( - width=target_width, - height=target_height, - num_frames=num_frames, - num_videos=num_videos, - ) } prompt_text = image_token*num_images @@ -253,25 +243,12 @@ def _get_mm_fields_config( hf_processor_mm_kwargs, num_image_patches: torch.Tensor = None, ): - # NOTE from `transformers`: we are planning on refactoring image processors to return same format as output - # The final solution would be either to converge as Idefics-style or used a nested tensor - if False and "image_grid_thw" in hf_inputs: # Qwen-style model - image_grid_thw = hf_inputs["image_grid_thw"] - image_grid_sizes = image_grid_thw.prod(-1) - mm_fields = { - "image_grid_thw" : MultiModalFieldConfig.batched("image"), - "image_embeds": MultiModalFieldConfig.flat_from_sizes("image", image_grid_sizes), - "pixel_values": MultiModalFieldConfig.flat_from_sizes("image", image_grid_sizes), - } - else: - hf_inputs.pop("attention_mask", None) - size_per_item = num_image_patches if num_image_patches is not None else torch.tensor([1] * len(hf_inputs["pixel_values"])) - mm_fields = {key: MultiModalFieldConfig.flat_from_sizes("image", size_per_item) for key in hf_inputs.keys() if "video" not in key} - mm_fields["image_embeds"] = MultiModalFieldConfig.flat_from_sizes("image", size_per_item) - # mm_fields = {key: MultiModalFieldConfig.batched("image") for key in hf_inputs.keys() if "video" not in key} - # mm_fields["image_embeds"] = MultiModalFieldConfig.batched("image") + hf_inputs.pop("attention_mask", None) # processors always return a mask but vLLM doesn't need it + mm_fields = {key: MultiModalFieldConfig.flat_from_sizes("image", num_image_patches) for key in hf_inputs.keys()} + mm_fields["image_embeds"] = MultiModalFieldConfig.flat_from_sizes("image", num_image_patches) + mm_fields["num_image_patches"] = MultiModalFieldConfig.batched("image") return mm_fields - + def _apply_hf_processor_text_mm( self, prompt_text, @@ -294,9 +271,8 @@ def _apply_hf_processor_text_mm( ) processed_data.update(passthrough_data) - print("prompt_text", prompt_text, processed_data['input_ids'].shape, processed_data["mm_token_type_ids"].shape, processed_data['pixel_values'].shape) prompt_ids, = processed_data.pop("input_ids").tolist() - mm_token_type_ids = processed_data.pop("mm_token_type_ids") + mm_token_type_ids = processed_data.pop("mm_token_type_ids") if "mm_token_type_ids" in processed_data else processed_data.pop("token_type_ids") # for gemma3 only return prompt_ids, processed_data, mm_token_type_ids @@ -340,7 +316,6 @@ def apply( image_sizes.append((image_size.height, image_size.width)) mm_tokens_per_modality = 
hf_processor._get_num_multimodal_tokens(image_sizes=image_sizes, **mm_processor_kwargs) - print("num_tokens", mm_token_type_ids.shape, mm_positions.shape, image_sizes, mm_tokens_per_modality, mm_processor_kwargs.keys()) mm_placeholders = {} split_sizes = mm_tokens_per_modality["num_image_tokens"] @@ -348,7 +323,6 @@ def apply( chunked_mm_positions = torch.split(mm_positions, split_sizes) mm_tokens = torch.tensor(prompt_ids)[mm_token_type_ids[0].bool()] chunked_mm_tokens = torch.split(mm_tokens, split_sizes) - print("Is embed", (mm_tokens == hf_processor.image_token_id).sum(-1)) ranges = [ PlaceholderRange(offset=positions[0].item(), length=positions.shape[0], is_embed=(mm_tokens == hf_processor.image_token_id).bool()) for positions, mm_tokens in zip(chunked_mm_positions, chunked_mm_tokens) @@ -356,6 +330,7 @@ def apply( mm_placeholders = {"image": ranges} num_image_patches = torch.tensor(mm_tokens_per_modality["num_image_patches"]) if "num_image_patches" in mm_tokens_per_modality else None + processed_data['num_image_patches'] = num_image_patches mm_kwargs = MultiModalKwargs.from_hf_inputs( processed_data, self._get_mm_fields_config(processed_data, hf_processor_mm_kwargs, num_image_patches), @@ -734,9 +709,10 @@ def hf_to_vllm_mapper(self): "language_model.model": "model.language_model", "text_model.model": "model.text_model", "vision_tower": "model.vision_tower", - "image_newline": "model.image_newline", "vqmodel": "model.vqmodel", "vision_model": "model.vision_model", + "vision_embed_tokens": "model.vision_embed_tokens", + "image_newline": "model.image_newline", "multi_modal_projector": "model.multi_modal_projector", "text_model.lm_head": "lm_head", "language_model.lm_head": "lm_head", @@ -781,23 +757,38 @@ def load_weights(self, weights: Iterable[tuple[str, def get_multimodal_embeddings(self, **kwargs): pixel_values = kwargs.pop("pixel_values", None) + pixel_values = pixel_values if pixel_values is not None else kwargs.pop("image_patches", None) image_embeds = kwargs.pop("image_embeds", None) + num_image_patches = kwargs.pop("num_image_patches") if pixel_values is None and image_embeds is None: return None if pixel_values is not None: - pixel_values = pixel_values.to(self.dtype) + if isinstance(pixel_values, torch.Tensor): + pixel_values = pixel_values.flatten(0, 1).to(self.dtype) + if isinstance(num_image_patches, list): + num_image_patches = torch.cat(num_image_patches) + num_image_patches = num_image_patches.flatten() + else: + pixel_values = torch.cat(pixel_values).to(self.dtype) + num_image_patches = torch.cat(num_image_patches).flatten() + vision_embeddings = self.model.model.get_image_features( - # Thing about pixels being batched again, adding extra dim - # TODO: find out do we really need that extra dim - pixel_values.flatten(0, 1), + pixel_values, **{k: v.flatten(0, 1) for k, v in kwargs.items()}, - ) + ) + if isinstance(vision_embeddings, torch.Tensor): - print("vision_embeddings", vision_embeddings.shape, pixel_values.shape) - # TODO: fix pixtral to return output of shape [bs, seq-len, dim] - # vision_embeddings = vision_embeddings.reshape(pixel_values.shape[0], -1, 5120) + if vision_embeddings.ndim == 2: + vision_embeddings = vision_embeddings.unsqueeze(0) + + # Embeddings have to be 2D tensors of length `num_images` but transformers + # returns concat tensors if each patch is of different size. 
We split it back + # to make vLLM assertions happy + vision_embeddings = torch.split(vision_embeddings, num_image_patches.tolist()) + vision_embeddings = [embed.flatten(start_dim=0, end_dim=-2) for embed in vision_embeddings] + return vision_embeddings if image_embeds is not None: @@ -810,16 +801,9 @@ def get_input_embeddings( ) -> torch.Tensor: inputs_embeds = self.model.model.get_input_embeddings()(input_ids) if multimodal_embeddings is not None: - # most supported VLMs merge like this, otherwise we can add a special - # `merge_multimodal_embeddings` method on HF side mask = (input_ids == self.config.image_token_id) mask = mask.unsqueeze(-1).expand_as(inputs_embeds) multimodal_embeddings = torch.cat(multimodal_embeddings) - # FIXME: The returned multimodal_embeddings must be either a 3D torch.Tensor of shape - # (num_items, feature_size, hidden_size), or a list / tuple of 2D torch.Tensor’s of shape - # (feature_size, hidden_size), so that multimodal_embeddings[i] retrieves the embeddings generated - # from the i-th multimodal data item (e.g, image) of the request. - print(mask[..., 0].sum(-1), multimodal_embeddings.shape) inputs_embeds = inputs_embeds.masked_scatter(mask, multimodal_embeddings) return inputs_embeds \ No newline at end of file From d36ab67c4a767c09bf0241dcf6eb873b937190b9 Mon Sep 17 00:00:00 2001 From: raushan Date: Thu, 29 May 2025 14:23:34 +0200 Subject: [PATCH 11/23] style --- requirements/test.txt | 22 ++- vllm/model_executor/models/registry.py | 3 +- vllm/model_executor/models/transformers.py | 163 +++++++++++++-------- vllm/multimodal/inputs.py | 11 +- 4 files changed, 133 insertions(+), 66 deletions(-) diff --git a/requirements/test.txt b/requirements/test.txt index 89d47701734..df377085602 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -27,6 +27,10 @@ argcomplete==3.5.1 # via datamodel-code-generator arrow==1.3.0 # via isoduration +async-timeout==5.0.1 + # via + # aiohttp + # redis attrs==24.2.0 # via # aiohttp @@ -129,6 +133,11 @@ eval-type-backport==0.2.2 # via mteb evaluate==0.4.3 # via lm-eval +exceptiongroup==1.3.0 + # via + # anyio + # hypothesis + # pytest fastparquet==2024.11.0 # via genai-perf fastrlock==0.8.2 @@ -640,7 +649,6 @@ setuptools==77.0.3 # via # mamba-ssm # pytablewriter - # torch # triton shellingham==1.5.4 # via typer @@ -700,8 +708,13 @@ tokenizers==0.21.1 # via # -r requirements/test.in # transformers +toml==0.10.2 + # via datamodel-code-generator tomli==2.2.1 - # via schemathesis + # via + # black + # pytest + # schemathesis tomli-w==1.2.0 # via schemathesis torch==2.7.0+cu128 @@ -775,13 +788,18 @@ types-python-dateutil==2.9.0.20241206 # via arrow typing-extensions==4.12.2 # via + # anyio + # black + # exceptiongroup # huggingface-hub # librosa # mistral-common # mteb + # multidict # pqdm # pydantic # pydantic-core + # rich # torch # typer tzdata==2024.2 diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 80c3b37497e..2241f0bab55 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -459,7 +459,8 @@ def _normalize_archs( if len(normalized_arch) != len(architectures): # The order matters. 
If causal comes first, checks on MM model fails because it is not registered in MultimodalRegistry # TODO: needs help from vLLM team - normalized_arch.extend(["TransformersForMultimodalLM", "TransformersForCausalLM"]) + normalized_arch.extend( + ["TransformersForMultimodalLM", "TransformersForCausalLM"]) return normalized_arch def inspect_model_cls( diff --git a/vllm/model_executor/models/transformers.py b/vllm/model_executor/models/transformers.py index 8e90b3986ee..76b8b007d3f 100644 --- a/vllm/model_executor/models/transformers.py +++ b/vllm/model_executor/models/transformers.py @@ -14,11 +14,9 @@ # See the License for the specific language governing permissions and # limitations under the License. """Wrapper around `transformers` models""" -import re -from itertools import chain from collections.abc import Iterable from contextlib import contextmanager, nullcontext -from typing import Iterable, Literal, Optional, Union +from typing import Literal, Optional, Union import regex as re import torch @@ -42,15 +40,18 @@ ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs +from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalInputs, + PlaceholderRange) +from vllm.multimodal.parse import ImageProcessorItems +from vllm.multimodal.processing import (BaseMultiModalProcessor, + BaseProcessingInfo) +from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs from vllm.sequence import IntermediateTensors from vllm.transformers_utils.processor import cached_get_processor -from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry, MultiModalKwargs -from vllm.multimodal.processing import BaseMultiModalProcessor, BaseProcessingInfo -from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs -from vllm.multimodal.inputs import MultiModalFieldConfig, MultiModalInputs, PlaceholderRange -from vllm.multimodal.parse import ImageProcessorItems, ImageSize -from .interfaces import SupportsLoRA, SupportsPP, SupportsQuant, SupportsMultiModal +from .interfaces import (SupportsLoRA, SupportsMultiModal, SupportsPP, + SupportsQuant) from .utils import (AutoWeightsLoader, PPMissingLayer, WeightsMapper, is_pp_missing_parameter, make_empty_intermediate_tensors_factory, maybe_prefix) @@ -124,7 +125,7 @@ def replace_linear_class( def init_on_device_without_buffers(device: torch.device): """ A context manager under which models are initialized with all parameters on the specified device. - However buffers are not initalized on specified device. + However buffers are not initialized on specified device. 
Args: device (`torch.device`): @@ -139,28 +140,35 @@ def register_empty_parameter(module, name, param): param_cls = type(module._parameters[name]) kwargs = module._parameters[name].__dict__ kwargs["requires_grad"] = param.requires_grad - module._parameters[name] = param_cls(module._parameters[name].to(device), **kwargs) + module._parameters[name] = param_cls( + module._parameters[name].to(device), **kwargs) tensor_constructors_to_patch = {} def patch_tensor_constructor(fn): + def wrapper(*args, **kwargs): kwargs["device"] = device return fn(*args, **kwargs) + return wrapper try: nn.Module.register_parameter = register_empty_parameter - for torch_function_name in tensor_constructors_to_patch.keys(): - setattr(torch, torch_function_name, patch_tensor_constructor(getattr(torch, torch_function_name))) + for torch_function_name in tensor_constructors_to_patch: + setattr( + torch, torch_function_name, + patch_tensor_constructor(getattr(torch, torch_function_name))) yield finally: nn.Module.register_parameter = old_register_parameter - for torch_function_name, old_torch_function in tensor_constructors_to_patch.items(): + for torch_function_name, old_torch_function in tensor_constructors_to_patch.items( + ): setattr(torch, torch_function_name, old_torch_function) class MultiModalProcessingInfo(BaseProcessingInfo): + def get_hf_config(self): return self.ctx.model_config.hf_config @@ -174,7 +182,8 @@ def get_max_image_tokens(self) -> int: width, height = self.get_max_image_size() processor = self.get_hf_processor() mm_processor_kwargs = self.ctx.model_config.mm_processor_kwargs or {} - mm_tokens = processor._get_num_multimodal_tokens(image_sizes=([height, width],), **mm_processor_kwargs) + mm_tokens = processor._get_num_multimodal_tokens( + image_sizes=([height, width], ), **mm_processor_kwargs) image_tokens = mm_tokens["num_image_tokens"][0] return image_tokens @@ -183,10 +192,11 @@ def get_hf_processor(self): return processor def get_max_image_size(self): - return 10_000, 10_000 # hardcode for arbitrary very large size + return 10_000, 10_000 # hardcode for arbitrary very large size class MultiModalDummyInputsBuilder(BaseDummyInputsBuilder): + def get_dummy_processor_inputs( self, seq_len, @@ -196,20 +206,19 @@ def get_dummy_processor_inputs( processor = self.info.get_hf_processor() if "gemma3" in processor.__class__.__name__.lower(): - image_token = getattr(processor, "boi_token") + image_token = processor.boi_token else: image_token = getattr(processor, "image_token", "") target_width, target_height = self.info.get_max_image_size() mm_data = { - "image": self._get_dummy_images( - width=target_width, - height=target_height, - num_images=1 - ), + "image": + self._get_dummy_images(width=target_width, + height=target_height, + num_images=1), } - prompt_text = image_token*num_images + prompt_text = image_token * num_images return ProcessorInputs( prompt_text=prompt_text, mm_data=mm_data, @@ -217,6 +226,7 @@ def get_dummy_processor_inputs( class MultiModalProcessor(BaseMultiModalProcessor): + def _get_prompt_updates( self, mm_items, @@ -244,9 +254,16 @@ def _get_mm_fields_config( hf_processor_mm_kwargs, num_image_patches: torch.Tensor = None, ): - hf_inputs.pop("attention_mask", None) # processors always return a mask but vLLM doesn't need it - mm_fields = {key: MultiModalFieldConfig.flat_from_sizes("image", num_image_patches) for key in hf_inputs.keys()} - mm_fields["image_embeds"] = MultiModalFieldConfig.flat_from_sizes("image", num_image_patches) + hf_inputs.pop( + "attention_mask", + None) # 
processors always return a mask but vLLM doesn't need it + mm_fields = { + key: MultiModalFieldConfig.flat_from_sizes("image", + num_image_patches) + for key in hf_inputs.keys() + } + mm_fields["image_embeds"] = MultiModalFieldConfig.flat_from_sizes( + "image", num_image_patches) mm_fields["num_image_patches"] = MultiModalFieldConfig.batched("image") return mm_fields @@ -273,7 +290,10 @@ def _apply_hf_processor_text_mm( processed_data.update(passthrough_data) prompt_ids, = processed_data.pop("input_ids").tolist() - mm_token_type_ids = processed_data.pop("mm_token_type_ids") if "mm_token_type_ids" in processed_data else processed_data.pop("token_type_ids") # for gemma3 only + mm_token_type_ids = processed_data.pop( + "mm_token_type_ids" + ) if "mm_token_type_ids" in processed_data else processed_data.pop( + "token_type_ids") # for gemma3 only return prompt_ids, processed_data, mm_token_type_ids @@ -282,7 +302,7 @@ def apply( prompt, mm_data, hf_processor_mm_kwargs, - return_mm_hashes = False, + return_mm_hashes=False, ) -> MultiModalInputs: """ Process multi-modal inputs to be used in vLLM. @@ -293,8 +313,7 @@ def apply( if return_mm_hashes: raise ValueError( "TransformersMultimodalLM doesn't support mm hashing yet! Probably you did not set " - "`disable_mm_preprocessor_cache=True`." - ) + "`disable_mm_preprocessor_cache=True`.") mm_items = self._to_mm_items(mm_data) hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) @@ -316,7 +335,8 @@ def apply( image_size = images.get_image_size(item_idx) image_sizes.append((image_size.height, image_size.width)) - mm_tokens_per_modality = hf_processor._get_num_multimodal_tokens(image_sizes=image_sizes, **mm_processor_kwargs) + mm_tokens_per_modality = hf_processor._get_num_multimodal_tokens( + image_sizes=image_sizes, **mm_processor_kwargs) mm_placeholders = {} split_sizes = mm_tokens_per_modality["num_image_tokens"] @@ -325,16 +345,23 @@ def apply( mm_tokens = torch.tensor(prompt_ids)[mm_token_type_ids[0].bool()] chunked_mm_tokens = torch.split(mm_tokens, split_sizes) ranges = [ - PlaceholderRange(offset=positions[0].item(), length=positions.shape[0], is_embed=(mm_tokens == hf_processor.image_token_id).bool()) - for positions, mm_tokens in zip(chunked_mm_positions, chunked_mm_tokens) + PlaceholderRange( + offset=positions[0].item(), + length=positions.shape[0], + is_embed=(mm_tokens == hf_processor.image_token_id).bool()) + for positions, mm_tokens in zip(chunked_mm_positions, + chunked_mm_tokens) ] mm_placeholders = {"image": ranges} - num_image_patches = torch.tensor(mm_tokens_per_modality["num_image_patches"]) if "num_image_patches" in mm_tokens_per_modality else None + num_image_patches = torch.tensor( + mm_tokens_per_modality["num_image_patches"] + ) if "num_image_patches" in mm_tokens_per_modality else None processed_data['num_image_patches'] = num_image_patches mm_kwargs = MultiModalKwargs.from_hf_inputs( processed_data, - self._get_mm_fields_config(processed_data, hf_processor_mm_kwargs, num_image_patches), + self._get_mm_fields_config(processed_data, hf_processor_mm_kwargs, + num_image_patches), ) return MultiModalInputs( @@ -346,6 +373,7 @@ def apply( mm_placeholders=mm_placeholders, ) + class ConfigOverride: """Context manager to temporarily override config attributes.""" @@ -415,7 +443,10 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): # weights mapper to rename weights. 
self.model: PreTrainedModel = AutoModel.from_config( config, - attn_implementation={"text_config": "vllm", "vision_config": "eager"}, + attn_implementation={ + "text_config": "vllm", + "vision_config": "eager" + }, torch_dtype=model_config.dtype, trust_remote_code=model_config.trust_remote_code, ) @@ -473,14 +504,15 @@ def pipeline_parallel(self): # Layers before module list for name in pp_plan[:module_list_idx]: - if self.pp_group.is_first_rank or (self.text_config.tie_word_embeddings - and self.pp_group.is_last_rank): + if self.pp_group.is_first_rank or ( + self.text_config.tie_word_embeddings + and self.pp_group.is_last_rank): continue setattr(self.model, name, PPMissingLayer()) # Module list - start_layer, end_layer = get_pp_indices(self.text_config.num_hidden_layers, - self.pp_rank, self.pp_size) + start_layer, end_layer = get_pp_indices( + self.text_config.num_hidden_layers, self.pp_rank, self.pp_size) layers_name = pp_plan[module_list_idx] layers = getattr(self.model, layers_name) for i in range(len(layers)): @@ -559,7 +591,8 @@ def create_attention_instances(self) -> dict[int, Attention]: def meta_to_empty(self, module: nn.Module): for name, param in module.named_parameters(recurse=False): if param.device == torch.device("meta"): - new_param = torch.empty_like(param, device=self.device_config.device) + new_param = torch.empty_like(param, + device=self.device_config.device) new_param = type(param)(new_param) module._parameters[name] = new_param for child in module.children(): @@ -629,7 +662,8 @@ def load_weights(self, weights: Iterable[tuple[str, class TransformersForCausalLM(nn.Module, SupportsQuant, SupportsLoRA, SupportsPP): embedding_padding_modules = ["lm_head"] - embedding_modules = ["embed_tokens"] # TODO transformers will have a util to get it + embedding_modules = ["embed_tokens" + ] # TODO transformers will have a util to get it def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() @@ -706,11 +740,12 @@ def load_weights(self, weights: Iterable[tuple[str, return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) -@MULTIMODAL_REGISTRY.register_processor(MultiModalProcessor, - info=MultiModalProcessingInfo, - dummy_inputs=MultiModalDummyInputsBuilder) +@MULTIMODAL_REGISTRY.register_processor( + MultiModalProcessor, + info=MultiModalProcessingInfo, + dummy_inputs=MultiModalDummyInputsBuilder) class TransformersForMultimodalLM(nn.Module, SupportsQuant, SupportsLoRA, - SupportsPP, SupportsMultiModal): + SupportsPP, SupportsMultiModal): embedding_padding_modules = ["lm_head"] embedding_modules = ["embed_tokens"] @@ -766,9 +801,7 @@ def hf_to_vllm_mapper(self): prefix_mapper["model"] = "model.language_model" prefix_mapper["visual"] = "model.visual" - return WeightsMapper( - orig_to_new_prefix=prefix_mapper, - ) + return WeightsMapper(orig_to_new_prefix=prefix_mapper, ) def forward( self, @@ -794,14 +827,16 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader( self, - skip_prefixes=(["lm_head."] - if self.config.get_text_config().tie_word_embeddings else None), + skip_prefixes=([ + "lm_head." 
+ ] if self.config.get_text_config().tie_word_embeddings else None), ) return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) def get_multimodal_embeddings(self, **kwargs): pixel_values = kwargs.pop("pixel_values", None) - pixel_values = pixel_values if pixel_values is not None else kwargs.pop("image_patches", None) + pixel_values = pixel_values if pixel_values is not None else kwargs.pop( + "image_patches", None) image_embeds = kwargs.pop("image_embeds", None) num_image_patches = kwargs.pop("num_image_patches") @@ -819,9 +854,12 @@ def get_multimodal_embeddings(self, **kwargs): num_image_patches = torch.cat(num_image_patches).flatten() vision_embeddings = self.model.model.get_image_features( - pixel_values, - **{k: v.flatten(0, 1) for k, v in kwargs.items()}, - ) + pixel_values, + **{ + k: v.flatten(0, 1) + for k, v in kwargs.items() + }, + ) if isinstance(vision_embeddings, torch.Tensor): if vision_embeddings.ndim == 2: @@ -830,8 +868,12 @@ def get_multimodal_embeddings(self, **kwargs): # Embeddings have to be 2D tensors of length `num_images` but transformers # returns concat tensors if each patch is of different size. We split it back # to make vLLM assertions happy - vision_embeddings = torch.split(vision_embeddings, num_image_patches.tolist()) - vision_embeddings = [embed.flatten(start_dim=0, end_dim=-2) for embed in vision_embeddings] + vision_embeddings = torch.split(vision_embeddings, + num_image_patches.tolist()) + vision_embeddings = [ + embed.flatten(start_dim=0, end_dim=-2) + for embed in vision_embeddings + ] return vision_embeddings @@ -841,7 +883,7 @@ def get_multimodal_embeddings(self, **kwargs): def get_input_embeddings( self, input_ids: torch.Tensor, - multimodal_embeddings = None, + multimodal_embeddings=None, ) -> torch.Tensor: inputs_embeds = self.model.model.get_input_embeddings()(input_ids) if multimodal_embeddings is not None: @@ -849,5 +891,6 @@ def get_input_embeddings( mask = mask.unsqueeze(-1).expand_as(inputs_embeds) multimodal_embeddings = torch.cat(multimodal_embeddings) - inputs_embeds = inputs_embeds.masked_scatter(mask, multimodal_embeddings) - return inputs_embeds \ No newline at end of file + inputs_embeds = inputs_embeds.masked_scatter( + mask, multimodal_embeddings) + return inputs_embeds diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py index c301b06ed89..b0780a97357 100644 --- a/vllm/multimodal/inputs.py +++ b/vllm/multimodal/inputs.py @@ -814,12 +814,17 @@ def get_items(self, modality: str) -> Sequence[MultiModalKwargsItem]: def get_hf_inputs(self, modality: str) -> dict[str, NestedTensors]: modality_items = self._items_by_modality.get(modality, None) hf_inputs = defaultdict[str, list[NestedTensors]](list) - if modality_items is not None: + if modality_items is not None: for mm_kwargs_item in modality_items: for key, value in mm_kwargs_item.items(): hf_inputs[key].append(value.data) - hf_inputs = {key: torch.stack(value) for key, value in hf_inputs.items()} - return hf_inputs + + hf_inputs_as_tensors = { + key: torch.stack(value) + for key, value in hf_inputs.items() + } + return hf_inputs_as_tensors + MultiModalPlaceholderDict: TypeAlias = Mapping[str, Sequence[PlaceholderRange]] """ From bf08a9e0ee59d47392fec7513fe7a3bdf0245b14 Mon Sep 17 00:00:00 2001 From: raushan Date: Thu, 29 May 2025 18:04:08 +0200 Subject: [PATCH 12/23] need to update dummy builder after rebase --- vllm/model_executor/models/transformers.py | 38 ++++++++++------------ 1 file changed, 17 insertions(+), 21 deletions(-) diff --git 
a/vllm/model_executor/models/transformers.py b/vllm/model_executor/models/transformers.py index 76b8b007d3f..58e4c0585a2 100644 --- a/vllm/model_executor/models/transformers.py +++ b/vllm/model_executor/models/transformers.py @@ -14,7 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """Wrapper around `transformers` models""" -from collections.abc import Iterable +from collections.abc import Iterable, Mapping from contextlib import contextmanager, nullcontext from typing import Literal, Optional, Union @@ -42,7 +42,7 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalInputs, - PlaceholderRange) + PlaceholderRange, MultiModalDataDict) from vllm.multimodal.parse import ImageProcessorItems from vllm.multimodal.processing import (BaseMultiModalProcessor, BaseProcessingInfo) @@ -197,11 +197,7 @@ def get_max_image_size(self): class MultiModalDummyInputsBuilder(BaseDummyInputsBuilder): - def get_dummy_processor_inputs( - self, - seq_len, - mm_counts, - ) -> ProcessorInputs: + def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str: num_images = mm_counts.get("image", 0) processor = self.info.get_hf_processor() @@ -209,21 +205,24 @@ def get_dummy_processor_inputs( image_token = processor.boi_token else: image_token = getattr(processor, "image_token", "") + return image_token * num_images + + def get_dummy_mm_data( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> MultiModalDataDict: + num_images = mm_counts.get("image", 0) + target_width, target_height = self.info.get_max_image_size() - mm_data = { + return { "image": self._get_dummy_images(width=target_width, height=target_height, - num_images=1), + num_images=num_images), } - prompt_text = image_token * num_images - return ProcessorInputs( - prompt_text=prompt_text, - mm_data=mm_data, - ) - class MultiModalProcessor(BaseMultiModalProcessor): @@ -436,17 +435,14 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config_override = ConfigOverride( config, sliding_window=config.interleaved_sliding_window) - # Use meta device to delay allocating GPU tensors + # Set correct attn impl and init on "meta" to delay allocating GPU tensors + self.text_config._attn_implementation = "vllm" with init_on_device_without_buffers("meta"): # FIXME(Isotr0py): We need to refactor this part in the future to # avoid registering an extra model layer, otherwise we will need a # weights mapper to rename weights. 
self.model: PreTrainedModel = AutoModel.from_config( config, - attn_implementation={ - "text_config": "vllm", - "vision_config": "eager" - }, torch_dtype=model_config.dtype, trust_remote_code=model_config.trust_remote_code, ) @@ -589,7 +585,7 @@ def create_attention_instances(self) -> dict[int, Attention]: return attention_instances def meta_to_empty(self, module: nn.Module): - for name, param in module.named_parameters(recurse=False): + for name, param in module.named_parameters(recurse=False): if param.device == torch.device("meta"): new_param = torch.empty_like(param, device=self.device_config.device) From ba1143a9361b188ef194b68d2e0ee3dfc73d2ce0 Mon Sep 17 00:00:00 2001 From: raushan Date: Thu, 29 May 2025 18:07:49 +0200 Subject: [PATCH 13/23] delet meta to device --- vllm/model_executor/models/transformers.py | 27 ++++++++++++++-------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/vllm/model_executor/models/transformers.py b/vllm/model_executor/models/transformers.py index 58e4c0585a2..9e67d8841d0 100644 --- a/vllm/model_executor/models/transformers.py +++ b/vllm/model_executor/models/transformers.py @@ -464,8 +464,8 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): # Attention layers self.attention_instances = self.create_attention_instances() - # Move meta tensors to device (should happen last) - self.meta_to_empty(self.model) + # Initialize any parameters that have not had their modules replaced + self.init_parameters(self.model) self.make_empty_intermediate_tensors = ( make_empty_intermediate_tensors_factory(["hidden_states"], @@ -584,15 +584,24 @@ def create_attention_instances(self) -> dict[int, Attention]: prefix=f"{i}.attn") return attention_instances - def meta_to_empty(self, module: nn.Module): - for name, param in module.named_parameters(recurse=False): + def init_parameters(self, module: nn.Module): + """ + If a `parameter` is on the `meta` device, then its parent + `module` is the original module created by: + + ```python + with torch.device("meta"): + self.model: PreTrainedModel = AutoModel.from_config(...) 
+ ``` + """ + for name, param in module.named_parameters(recurse=False): if param.device == torch.device("meta"): - new_param = torch.empty_like(param, - device=self.device_config.device) - new_param = type(param)(new_param) - module._parameters[name] = new_param + new_param = nn.Parameter( + torch.empty_like(param.data, + device=self.device_config.device)) + setattr(module, name, new_param) for child in module.children(): - self.meta_to_empty(child) + self.init_parameters(child) def get_input_embeddings(self) -> nn.Module: return self.model.get_input_embeddings() From 267a57f2fe56ef2b85fee88018ff2cc51137eff6 Mon Sep 17 00:00:00 2001 From: raushan Date: Fri, 30 May 2025 11:58:03 +0200 Subject: [PATCH 14/23] add tests --- tests/models/test_transformers.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/tests/models/test_transformers.py b/tests/models/test_transformers.py index 1a51b4aeab0..0244cec2995 100644 --- a/tests/models/test_transformers.py +++ b/tests/models/test_transformers.py @@ -10,6 +10,7 @@ from ..core.block.e2e.test_correctness_sliding_window import prep_prompts from ..utils import multi_gpu_test from .utils import check_logprobs_close +from transformers import AutoModelForImageTextToText def check_implementation( @@ -71,6 +72,27 @@ def test_models( model_impl=model_impl) +@pytest.mark.parametrize( + "model,model_impl", + [ + ("llava-hf/llava-onevision-qwen2-0.5b-ov-hf", "transformers"), # dynamic image length and number of patches + ("HuggingFaceTB/SmolVLM-256M-Instruct", "transformers"), # has col/row special token between patches + ("Qwen/Qwen2.5-VL-3B-Instruct", "transformers"), # pixel values from processor are not 4D or 5D arraya + ]) # no custom code support because custom models don't follow the standard yet! 
+def test_models_multimodal( + hf_runner: type[HfRunner], + vllm_runner: type[VllmRunner], + example_prompts: list[str], + model: str, + model_impl: str, +) -> None: + check_implementation(hf_runner, + vllm_runner, + example_prompts, + model, + model_impl=model_impl, + kwargs_ref={"auto_cls": AutoModelForImageTextToText},) + def test_hybrid_attention(vllm_runner: type[VllmRunner]) -> None: prompts, _, _ = prep_prompts(4, (800, 801)) kwargs_ref = {"max_model_len": 8192, "enforce_eager": True} From 2c73f8843bc1b2e0e8478d2714bd407ce9497b17 Mon Sep 17 00:00:00 2001 From: raushan Date: Mon, 2 Jun 2025 17:07:02 +0200 Subject: [PATCH 15/23] style Signed-off-by: raushan --- tests/models/test_transformers.py | 29 ++++++++------ vllm/model_executor/models/registry.py | 6 +-- vllm/model_executor/models/transformers.py | 44 +++++++++++----------- vllm/v1/engine/mm_input_cache.py | 9 +++-- 4 files changed, 50 insertions(+), 38 deletions(-) diff --git a/tests/models/test_transformers.py b/tests/models/test_transformers.py index 0244cec2995..efd45e5b93b 100644 --- a/tests/models/test_transformers.py +++ b/tests/models/test_transformers.py @@ -3,6 +3,7 @@ from typing import Any, Optional, Union import pytest +from transformers import AutoModelForImageTextToText from vllm.platforms import current_platform @@ -10,7 +11,6 @@ from ..core.block.e2e.test_correctness_sliding_window import prep_prompts from ..utils import multi_gpu_test from .utils import check_logprobs_close -from transformers import AutoModelForImageTextToText def check_implementation( @@ -75,10 +75,14 @@ def test_models( @pytest.mark.parametrize( "model,model_impl", [ - ("llava-hf/llava-onevision-qwen2-0.5b-ov-hf", "transformers"), # dynamic image length and number of patches - ("HuggingFaceTB/SmolVLM-256M-Instruct", "transformers"), # has col/row special token between patches - ("Qwen/Qwen2.5-VL-3B-Instruct", "transformers"), # pixel values from processor are not 4D or 5D arraya - ]) # no custom code support because custom models don't follow the standard yet! + ("llava-hf/llava-onevision-qwen2-0.5b-ov-hf", + "transformers"), # dynamic image length and number of patches + ("HuggingFaceTB/SmolVLM-256M-Instruct", + "transformers"), # has col/row special token between patches + ("Qwen/Qwen2.5-VL-3B-Instruct", "transformers" + ), # pixel values from processor are not 4D or 5D arraya + ] +) # no custom code support because custom models don't follow the standard yet! 
def test_models_multimodal( hf_runner: type[HfRunner], vllm_runner: type[VllmRunner], @@ -86,12 +90,15 @@ def test_models_multimodal( model: str, model_impl: str, ) -> None: - check_implementation(hf_runner, - vllm_runner, - example_prompts, - model, - model_impl=model_impl, - kwargs_ref={"auto_cls": AutoModelForImageTextToText},) + check_implementation( + hf_runner, + vllm_runner, + example_prompts, + model, + model_impl=model_impl, + kwargs_ref={"auto_cls": AutoModelForImageTextToText}, + ) + def test_hybrid_attention(vllm_runner: type[VllmRunner]) -> None: prompts, _, _ = prep_prompts(4, (800, 801)) diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 2241f0bab55..b4465f4722e 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -231,7 +231,7 @@ } _TRANSFORMERS_MODELS = { - "TransformersForMultimodalLM": ("transformers", "TransformersForMultimodalLM"), + "TransformersForMultimodalLM": ("transformers", "TransformersForMultimodalLM"), # noqa: E501 "TransformersForCausalLM": ("transformers", "TransformersForCausalLM"), } # yapf: enable @@ -457,8 +457,8 @@ def _normalize_archs( # make sure Transformers backend is put at the last as a fallback if len(normalized_arch) != len(architectures): - # The order matters. If causal comes first, checks on MM model fails because it is not registered in MultimodalRegistry - # TODO: needs help from vLLM team + # The order matters. If the CausalLM comes first, then checks for + # registered model in MultimodalRegistry fail normalized_arch.extend( ["TransformersForMultimodalLM", "TransformersForCausalLM"]) return normalized_arch diff --git a/vllm/model_executor/models/transformers.py b/vllm/model_executor/models/transformers.py index 9e67d8841d0..333a8837f0e 100644 --- a/vllm/model_executor/models/transformers.py +++ b/vllm/model_executor/models/transformers.py @@ -41,12 +41,12 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs -from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalInputs, - PlaceholderRange, MultiModalDataDict) +from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, + MultiModalInputs, PlaceholderRange) from vllm.multimodal.parse import ImageProcessorItems from vllm.multimodal.processing import (BaseMultiModalProcessor, BaseProcessingInfo) -from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs +from vllm.multimodal.profiling import BaseDummyInputsBuilder from vllm.sequence import IntermediateTensors from vllm.transformers_utils.processor import cached_get_processor @@ -124,8 +124,9 @@ def replace_linear_class( @contextmanager def init_on_device_without_buffers(device: torch.device): """ - A context manager under which models are initialized with all parameters on the specified device. - However buffers are not initialized on specified device. + A context manager under which models are initialized with all + parameters on the specified device. However buffers are not + initialized on specified device. 
Args: device (`torch.device`): @@ -162,8 +163,7 @@ def wrapper(*args, **kwargs): yield finally: nn.Module.register_parameter = old_register_parameter - for torch_function_name, old_torch_function in tensor_constructors_to_patch.items( - ): + for torch_function_name, old_torch_function in tensor_constructors_to_patch.items(): setattr(torch, torch_function_name, old_torch_function) @@ -216,7 +216,7 @@ def get_dummy_mm_data( target_width, target_height = self.info.get_max_image_size() - return { + return { "image": self._get_dummy_images(width=target_width, height=target_height, @@ -253,13 +253,11 @@ def _get_mm_fields_config( hf_processor_mm_kwargs, num_image_patches: torch.Tensor = None, ): - hf_inputs.pop( - "attention_mask", - None) # processors always return a mask but vLLM doesn't need it + # HF Processors always return a mask but vLLM doesn't need it + hf_inputs.pop("attention_mask", None) mm_fields = { - key: MultiModalFieldConfig.flat_from_sizes("image", - num_image_patches) - for key in hf_inputs.keys() + key: MultiModalFieldConfig.flat_from_sizes("image", num_image_patches) + for key in hf_inputs } mm_fields["image_embeds"] = MultiModalFieldConfig.flat_from_sizes( "image", num_image_patches) @@ -311,13 +309,17 @@ def apply( """ if return_mm_hashes: raise ValueError( - "TransformersMultimodalLM doesn't support mm hashing yet! Probably you did not set " - "`disable_mm_preprocessor_cache=True`.") + "TransformersMultimodalLM doesn't support mm hashing yet! " + "Probably you did not set `disable_mm_preprocessor_cache=True`") mm_items = self._to_mm_items(mm_data) hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) - prompt_ids, processed_data, mm_token_type_ids = self._apply_hf_processor_text_mm( + ( + prompt_ids, + processed_data, + mm_token_type_ids + ) = self._apply_hf_processor_text_mm( prompt_text=prompt, mm_items=mm_items, hf_processor_mm_kwargs=hf_processor_mm_kwargs, @@ -435,7 +437,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config_override = ConfigOverride( config, sliding_window=config.interleaved_sliding_window) - # Set correct attn impl and init on "meta" to delay allocating GPU tensors + # Set correct attn and init on "meta" to delay allocating GPU tensors self.text_config._attn_implementation = "vllm" with init_on_device_without_buffers("meta"): # FIXME(Isotr0py): We need to refactor this part in the future to @@ -870,9 +872,9 @@ def get_multimodal_embeddings(self, **kwargs): if vision_embeddings.ndim == 2: vision_embeddings = vision_embeddings.unsqueeze(0) - # Embeddings have to be 2D tensors of length `num_images` but transformers - # returns concat tensors if each patch is of different size. We split it back - # to make vLLM assertions happy + # Embeddings have to be 2D tensors of length `num_images` + # but transformers returns concat tensors if each patch + # is of different size. 
We split it back to make vLLM happy vision_embeddings = torch.split(vision_embeddings, num_image_patches.tolist()) vision_embeddings = [ diff --git a/vllm/v1/engine/mm_input_cache.py b/vllm/v1/engine/mm_input_cache.py index 3631f4d59fa..7ed0fa48327 100644 --- a/vllm/v1/engine/mm_input_cache.py +++ b/vllm/v1/engine/mm_input_cache.py @@ -34,10 +34,13 @@ class MirroredProcessingCache: def __init__(self, model_config): mm_config = model_config.multimodal_config - disable_mm_preprocessor_cache = mm_config is not None and mm_config.disable_mm_preprocessor_cache + disable_mm_preprocessor_cache = ( + mm_config is not None and mm_config.disable_mm_preprocessor_cache + ) self.use_cache = not disable_mm_preprocessor_cache - self.mm_cache = ProcessingCache.get_lru_cache(VLLM_MM_INPUT_CACHE_GIB, - MultiModalKwargs) + self.mm_cache = ProcessingCache.get_lru_cache( + VLLM_MM_INPUT_CACHE_GIB, MultiModalKwargs + ) def get_and_update_p0( self, From 8c1f220c7680d63191f3f7dbbe319ca7bd3ff46c Mon Sep 17 00:00:00 2001 From: raushan Date: Mon, 2 Jun 2025 17:26:48 +0200 Subject: [PATCH 16/23] i dont get the style guidelines Signed-off-by: raushan --- vllm/model_executor/models/transformers.py | 15 +++++++-------- vllm/v1/engine/mm_input_cache.py | 8 +++----- 2 files changed, 10 insertions(+), 13 deletions(-) diff --git a/vllm/model_executor/models/transformers.py b/vllm/model_executor/models/transformers.py index 333a8837f0e..3518baff9fb 100644 --- a/vllm/model_executor/models/transformers.py +++ b/vllm/model_executor/models/transformers.py @@ -163,7 +163,8 @@ def wrapper(*args, **kwargs): yield finally: nn.Module.register_parameter = old_register_parameter - for torch_function_name, old_torch_function in tensor_constructors_to_patch.items(): + for torch_function_name, old_torch_function in tensor_constructors_to_patch.items( + ): setattr(torch, torch_function_name, old_torch_function) @@ -256,7 +257,8 @@ def _get_mm_fields_config( # HF Processors always return a mask but vLLM doesn't need it hf_inputs.pop("attention_mask", None) mm_fields = { - key: MultiModalFieldConfig.flat_from_sizes("image", num_image_patches) + key: MultiModalFieldConfig.flat_from_sizes("image", + num_image_patches) for key in hf_inputs } mm_fields["image_embeds"] = MultiModalFieldConfig.flat_from_sizes( @@ -310,16 +312,13 @@ def apply( if return_mm_hashes: raise ValueError( "TransformersMultimodalLM doesn't support mm hashing yet! 
" - "Probably you did not set `disable_mm_preprocessor_cache=True`") + "Probably you didn't set `disable_mm_preprocessor_cache=True`") mm_items = self._to_mm_items(mm_data) hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) - ( - prompt_ids, - processed_data, - mm_token_type_ids - ) = self._apply_hf_processor_text_mm( + (prompt_ids, processed_data, + mm_token_type_ids) = self._apply_hf_processor_text_mm( prompt_text=prompt, mm_items=mm_items, hf_processor_mm_kwargs=hf_processor_mm_kwargs, diff --git a/vllm/v1/engine/mm_input_cache.py b/vllm/v1/engine/mm_input_cache.py index 7ed0fa48327..45fb5cd23f6 100644 --- a/vllm/v1/engine/mm_input_cache.py +++ b/vllm/v1/engine/mm_input_cache.py @@ -35,12 +35,10 @@ class MirroredProcessingCache: def __init__(self, model_config): mm_config = model_config.multimodal_config disable_mm_preprocessor_cache = ( - mm_config is not None and mm_config.disable_mm_preprocessor_cache - ) + mm_config is not None and mm_config.disable_mm_preprocessor_cache) self.use_cache = not disable_mm_preprocessor_cache - self.mm_cache = ProcessingCache.get_lru_cache( - VLLM_MM_INPUT_CACHE_GIB, MultiModalKwargs - ) + self.mm_cache = ProcessingCache.get_lru_cache(VLLM_MM_INPUT_CACHE_GIB, + MultiModalKwargs) def get_and_update_p0( self, From 8d5d67ea6c72ce69528e662c0dbc9c18f53a340f Mon Sep 17 00:00:00 2001 From: Raushan Turganbay Date: Tue, 3 Jun 2025 12:35:20 +0200 Subject: [PATCH 17/23] Update vllm/model_executor/models/transformers.py Co-authored-by: Cyrus Leung --- vllm/model_executor/models/transformers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/models/transformers.py b/vllm/model_executor/models/transformers.py index 3518baff9fb..5ce51647d4e 100644 --- a/vllm/model_executor/models/transformers.py +++ b/vllm/model_executor/models/transformers.py @@ -311,7 +311,7 @@ def apply( """ if return_mm_hashes: raise ValueError( - "TransformersMultimodalLM doesn't support mm hashing yet! " + "TransformersForMultimodalLM doesn't support mm hashing yet! 
" "Probably you didn't set `disable_mm_preprocessor_cache=True`") mm_items = self._to_mm_items(mm_data) From be850dc7807c0b8e6f720f50d22b551ab7d97347 Mon Sep 17 00:00:00 2001 From: raushan Date: Tue, 3 Jun 2025 12:47:34 +0200 Subject: [PATCH 18/23] address some comments Signed-off-by: raushan --- tests/models/test_transformers.py | 14 ++++++++------ vllm/model_executor/model_loader/utils.py | 16 ++++++++++++++-- vllm/model_executor/models/transformers.py | 10 +++++----- vllm/v1/engine/mm_input_cache.py | 4 ++-- 4 files changed, 29 insertions(+), 15 deletions(-) diff --git a/tests/models/test_transformers.py b/tests/models/test_transformers.py index efd45e5b93b..1e242b66af3 100644 --- a/tests/models/test_transformers.py +++ b/tests/models/test_transformers.py @@ -75,12 +75,14 @@ def test_models( @pytest.mark.parametrize( "model,model_impl", [ - ("llava-hf/llava-onevision-qwen2-0.5b-ov-hf", - "transformers"), # dynamic image length and number of patches - ("HuggingFaceTB/SmolVLM-256M-Instruct", - "transformers"), # has col/row special token between patches - ("Qwen/Qwen2.5-VL-3B-Instruct", "transformers" - ), # pixel values from processor are not 4D or 5D arraya + # Dynamic image length and number of patches + ("llava-hf/llava-onevision-qwen2-0.5b-ov-hf", "transformers"), + # Has col/row special token between patches + ("HuggingFaceTB/SmolVLM-256M-Instruct", "transformers"), + # Pixel values from processor are not 4D or 5D arrays + ("Qwen/Qwen2.5-VL-3B-Instruct", "transformers"), + # Check "auto" with fallback to transformers + ("BAAI/Emu3-Chat-hf", "auto"), ] ) # no custom code support because custom models don't follow the standard yet! def test_models_multimodal( diff --git a/vllm/model_executor/model_loader/utils.py b/vllm/model_executor/model_loader/utils.py index 213e7b7b682..63bfa6766f9 100644 --- a/vllm/model_executor/model_loader/utils.py +++ b/vllm/model_executor/model_loader/utils.py @@ -200,7 +200,13 @@ def resolve_transformers_arch(model_config: ModelConfig, raise ValueError( f"The Transformers implementation of {arch} is not " "compatible with vLLM.") - architectures[i] = "TransformersForMultimodalLM" + # Check if text-config is `self`. If not most probably it is + # a composite config, i.e. mutlimodal + if model_config.hf_config.get_text_config( + ) != model_config.hf_config: + architectures[i] = "TransformersForMultimodalLM" + else: + architectures[i] = "TransformersForCausalLM" if model_config.model_impl == ModelImpl.AUTO: if not model_module.is_backend_compatible(): raise ValueError( @@ -211,7 +217,13 @@ def resolve_transformers_arch(model_config: ModelConfig, "%s has no vLLM implementation, falling back to Transformers " "implementation. Some features may not be supported and " "performance may not be optimal.", arch) - architectures[i] = "TransformersForMultimodalLM" + # Check if text-config is `self`. If not most probably it is + # a composite config, i.e. 
mutlimodal + if model_config.hf_config.get_text_config( + ) != model_config.hf_config: + architectures[i] = "TransformersForMultimodalLM" + else: + architectures[i] = "TransformersForCausalLM" return architectures diff --git a/vllm/model_executor/models/transformers.py b/vllm/model_executor/models/transformers.py index 5ce51647d4e..d3fd82ebd7c 100644 --- a/vllm/model_executor/models/transformers.py +++ b/vllm/model_executor/models/transformers.py @@ -318,11 +318,11 @@ def apply( hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) (prompt_ids, processed_data, - mm_token_type_ids) = self._apply_hf_processor_text_mm( - prompt_text=prompt, - mm_items=mm_items, - hf_processor_mm_kwargs=hf_processor_mm_kwargs, - ) + mm_token_type_ids) = self._apply_hf_processor_text_mm( + prompt_text=prompt, + mm_items=mm_items, + hf_processor_mm_kwargs=hf_processor_mm_kwargs, + ) # HF processor will return `mm_token_type_ids` from which # we can infer mm_placeholders. Until then hardcode to make code run diff --git a/vllm/v1/engine/mm_input_cache.py b/vllm/v1/engine/mm_input_cache.py index 45fb5cd23f6..fcb90bebdb6 100644 --- a/vllm/v1/engine/mm_input_cache.py +++ b/vllm/v1/engine/mm_input_cache.py @@ -34,8 +34,8 @@ class MirroredProcessingCache: def __init__(self, model_config): mm_config = model_config.multimodal_config - disable_mm_preprocessor_cache = ( - mm_config is not None and mm_config.disable_mm_preprocessor_cache) + disable_mm_preprocessor_cache = mm_config is not None and \ + not mm_config.disable_mm_preprocessor_cache self.use_cache = not disable_mm_preprocessor_cache self.mm_cache = ProcessingCache.get_lru_cache(VLLM_MM_INPUT_CACHE_GIB, MultiModalKwargs) From e7303234a535f3449f410feeaf757b4cf9dda725 Mon Sep 17 00:00:00 2001 From: raushan Date: Tue, 3 Jun 2025 15:10:48 +0200 Subject: [PATCH 19/23] forgot to add `@support_torch_compile` decorator Signed-off-by: raushan --- vllm/model_executor/models/transformers.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/vllm/model_executor/models/transformers.py b/vllm/model_executor/models/transformers.py index d3fd82ebd7c..a83c7dd720d 100644 --- a/vllm/model_executor/models/transformers.py +++ b/vllm/model_executor/models/transformers.py @@ -750,6 +750,7 @@ def load_weights(self, weights: Iterable[tuple[str, MultiModalProcessor, info=MultiModalProcessingInfo, dummy_inputs=MultiModalDummyInputsBuilder) +@support_torch_compile class TransformersForMultimodalLM(nn.Module, SupportsQuant, SupportsLoRA, SupportsPP, SupportsMultiModal): embedding_padding_modules = ["lm_head"] @@ -790,6 +791,10 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): @property def hf_to_vllm_mapper(self): + # Backwards compatibility for prev released models + # State dicts back then had different formats + # and cannot be loaded with `AutoModel` mapping + # as is prefix_mapper = { "language_model.model": "model.language_model", "text_model.model": "model.text_model", From cfa199887757fd820d52ba855a1d014c0abd2a44 Mon Sep 17 00:00:00 2001 From: raushan Date: Wed, 4 Jun 2025 10:35:20 +0200 Subject: [PATCH 20/23] cant compile yet + clean up commented code Signed-off-by: raushan --- vllm/model_executor/models/registry.py | 1 - vllm/model_executor/models/transformers.py | 14 ++++++-------- 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index b4465f4722e..651152e5cc8 100644 --- a/vllm/model_executor/models/registry.py +++ 
b/vllm/model_executor/models/registry.py @@ -185,7 +185,6 @@ "GraniteSpeechForConditionalGeneration": ("granite_speech", "GraniteSpeechForConditionalGeneration"), # noqa: E501 "H2OVLChatModel": ("h2ovl", "H2OVLChatModel"), "InternVLChatModel": ("internvl", "InternVLChatModel"), - # "InternVLForConditionalGeneration": ("internvl", "InternVLForConditionalGeneration"), # noqa: E501 "Idefics3ForConditionalGeneration":("idefics3","Idefics3ForConditionalGeneration"), "SmolVLMForConditionalGeneration": ("smolvlm","SmolVLMForConditionalGeneration"), # noqa: E501 "KimiVLForConditionalGeneration": ("kimi_vl", "KimiVLForConditionalGeneration"), # noqa: E501 diff --git a/vllm/model_executor/models/transformers.py b/vllm/model_executor/models/transformers.py index a83c7dd720d..891f50a02a5 100644 --- a/vllm/model_executor/models/transformers.py +++ b/vllm/model_executor/models/transformers.py @@ -174,10 +174,10 @@ def get_hf_config(self): return self.ctx.model_config.hf_config def get_supported_mm_limits(self): - return {"image": None, "video": None} + return {"image": None} def get_mm_max_tokens_per_item(self, seq_len, mm_counts): - return {"image": self.get_max_image_tokens(), "video": 0} + return {"image": self.get_max_image_tokens()} def get_max_image_tokens(self) -> int: width, height = self.get_max_image_size() @@ -750,7 +750,6 @@ def load_weights(self, weights: Iterable[tuple[str, MultiModalProcessor, info=MultiModalProcessingInfo, dummy_inputs=MultiModalDummyInputsBuilder) -@support_torch_compile class TransformersForMultimodalLM(nn.Module, SupportsQuant, SupportsLoRA, SupportsPP, SupportsMultiModal): embedding_padding_modules = ["lm_head"] @@ -857,12 +856,11 @@ def get_multimodal_embeddings(self, **kwargs): if pixel_values is not None: if isinstance(pixel_values, torch.Tensor): pixel_values = pixel_values.flatten(0, 1).to(self.dtype) - if isinstance(num_image_patches, list): - num_image_patches = torch.cat(num_image_patches) - num_image_patches = num_image_patches.flatten() else: pixel_values = torch.cat(pixel_values).to(self.dtype) - num_image_patches = torch.cat(num_image_patches).flatten() + + if isinstance(num_image_patches, list): + num_image_patches = torch.cat(num_image_patches) vision_embeddings = self.model.model.get_image_features( pixel_values, @@ -880,7 +878,7 @@ def get_multimodal_embeddings(self, **kwargs): # but transformers returns concat tensors if each patch # is of different size. 
We split it back to make vLLM happy vision_embeddings = torch.split(vision_embeddings, - num_image_patches.tolist()) + num_image_patches.flatten().tolist()) vision_embeddings = [ embed.flatten(start_dim=0, end_dim=-2) for embed in vision_embeddings From 52bda05073152bcea6e7f6e4359c4532aa52e5a9 Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Mon, 16 Jun 2025 18:00:49 +0800 Subject: [PATCH 21/23] fix param dtype Signed-off-by: Isotr0py <2037008807@qq.com> --- vllm/model_executor/models/transformers.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/model_executor/models/transformers.py b/vllm/model_executor/models/transformers.py index 891f50a02a5..533c51457b7 100644 --- a/vllm/model_executor/models/transformers.py +++ b/vllm/model_executor/models/transformers.py @@ -599,6 +599,7 @@ def init_parameters(self, module: nn.Module): if param.device == torch.device("meta"): new_param = nn.Parameter( torch.empty_like(param.data, + dtype=self.model_config.dtype, device=self.device_config.device)) setattr(module, name, new_param) for child in module.children(): From 6ef7b35c7c0f837bdf77f961a5bb730af2c72ec7 Mon Sep 17 00:00:00 2001 From: raushan Date: Tue, 17 Jun 2025 13:57:32 +0200 Subject: [PATCH 22/23] mention VLMs in the docs --- docs/models/supported_models.md | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index 60f7dacebfa..90d541e7ec4 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -21,7 +21,7 @@ These models are what we list in [supported-text-models][supported-text-models] ### Transformers -vLLM also supports model implementations that are available in Transformers. This does not currently work for all models, but most decoder language models are supported, and vision language model support is planned! +vLLM also supports model implementations that are available in Transformers. This does not currently work for all models, but most decoder language models and common vision language models are supported! To check if the modeling backend is Transformers, you can simply do this: ```python from vllm import LLM llm = LLM(model=..., task="generate") # Name or path of your model llm.apply_model(lambda model: print(type(model))) ``` -If it is `TransformersForCausalLM` then it means it's based on Transformers! +If it is `TransformersForCausalLM` or `TransformersForMultimodalLM` then it means it's based on Transformers! !!! tip - You can force the use of `TransformersForCausalLM` by setting `model_impl="transformers"` for [offline-inference][offline-inference] or `--model-impl transformers` for the [openai-compatible-server][openai-compatible-server]. + You can force the use of the `Transformers` model by setting `model_impl="transformers"` for [offline-inference][offline-inference] or `--model-impl transformers` for the [openai-compatible-server][openai-compatible-server]. !!! note vLLM may not fully optimise the Transformers implementation so you may see degraded performance if comparing a native model to a Transformers model in vLLM. +!!! note + For vision language models, if you are loading with `dtype="auto"`, vLLM loads the whole model with the config's `dtype` if it exists. In contrast, native Transformers will respect the `dtype` attribute of each backbone in the model. That might cause a slight difference in performance. + #### Custom models If a model is neither supported natively by vLLM or Transformers, it can still be used in vLLM!
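The `dtype` note added above is easiest to see with an explicit override. A minimal offline sketch, assuming a checkpoint taken from the test matrix earlier in this series and that you want to force the Transformers fallback instead of relying on `dtype="auto"`:

```python
from vllm import LLM

# Force the Transformers fallback for a VLM and pin the dtype explicitly,
# rather than letting dtype="auto" pick the config's top-level dtype.
llm = LLM(
    model="Qwen/Qwen2.5-VL-3B-Instruct",  # example checkpoint only
    model_impl="transformers",
    dtype="bfloat16",
)

# Should print TransformersForMultimodalLM for a multimodal checkpoint.
llm.apply_model(lambda model: print(type(model)))
```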
@@ -97,7 +100,7 @@ Here is what happens in the background when this model is loaded: 1. The config is loaded. 2. `MyModel` Python class is loaded from the `auto_map` in config, and we check that the model `is_backend_compatible()`. -3. `MyModel` is loaded into `TransformersForCausalLM` (see ) which sets `self.config._attn_implementation = "vllm"` so that vLLM's attention layer is used. +3. `MyModel` is loaded into `TransformersForCausalLM` or `TransformersForMultimodalLM` (see ) which sets `self.config._attn_implementation = "vllm"` so that vLLM's attention layer is used. That's it! From d1e6d956dbb8b68c1e18f0182d1dc7a468b9c7ee Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Wed, 18 Jun 2025 16:22:36 +0800 Subject: [PATCH 23/23] v0 backward compatibility Signed-off-by: Isotr0py <2037008807@qq.com> --- vllm/model_executor/models/transformers.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/models/transformers.py b/vllm/model_executor/models/transformers.py index c092a40ff64..24df03363e1 100644 --- a/vllm/model_executor/models/transformers.py +++ b/vllm/model_executor/models/transformers.py @@ -821,7 +821,17 @@ def forward( positions: torch.Tensor, intermediate_tensors: Optional[IntermediateTensors] = None, inputs_embeds: Optional[torch.Tensor] = None, + **kwargs: object, ) -> Union[torch.Tensor, IntermediateTensors]: + # NOTE: In v1, inputs_embeds is always generated at model runner from + # `get_multimodal_embeddings` and `get_input_embeddings`, this + # condition is only for v0 compatibility. + if inputs_embeds is None: + multimodal_embeds = self.get_multimodal_embeddings(**kwargs) + if multimodal_embeds is not None: + inputs_embeds = self.get_input_embeddings(input_ids, multimodal_embeds) + input_ids = None + model_output = self.model(input_ids, positions, intermediate_tensors, inputs_embeds) return model_output @@ -850,11 +860,11 @@ def get_multimodal_embeddings(self, **kwargs): pixel_values = pixel_values if pixel_values is not None else kwargs.pop( "image_patches", None) image_embeds = kwargs.pop("image_embeds", None) - num_image_patches = kwargs.pop("num_image_patches") if pixel_values is None and image_embeds is None: return None + num_image_patches = kwargs.pop("num_image_patches") if pixel_values is not None: if isinstance(pixel_values, torch.Tensor): pixel_values = pixel_values.flatten(0, 1).to(self.dtype)
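For context, the multimodal path added in this series can be exercised end to end with a short offline run. A minimal sketch, with the caveats that the checkpoint comes from the tests above, the raw `<image>` placeholder is an assumption (check the checkpoint's chat template), and `disable_mm_preprocessor_cache=True` follows the error message raised by `MultiModalProcessor.apply`:

```python
from PIL import Image

from vllm import LLM, SamplingParams

llm = LLM(
    model="llava-hf/llava-onevision-qwen2-0.5b-ov-hf",  # example checkpoint only
    model_impl="transformers",
    disable_mm_preprocessor_cache=True,  # mm hashing is not supported yet
)

image = Image.open("example.jpg")  # any local RGB image
outputs = llm.generate(
    {
        "prompt": "<image>\nDescribe the image.",
        "multi_modal_data": {"image": image},
    },
    SamplingParams(temperature=0.0, max_tokens=32),
)
print(outputs[0].outputs[0].text)
```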