From f36c4f91c6de67d724e6a9d090cea724f4384b0b Mon Sep 17 00:00:00 2001 From: Max de Bayser Date: Mon, 24 Mar 2025 15:59:38 -0300 Subject: [PATCH 01/63] Remove guardrails that prevent V1 from trying to run embedding models Signed-off-by: Max de Bayser --- vllm/engine/arg_utils.py | 6 --- vllm/model_executor/models/bert.py | 53 ++++++++++++++++++------ vllm/model_executor/models/roberta.py | 52 ++++++++++++----------- vllm/v1/attention/backends/flash_attn.py | 28 ++++++++++--- vllm/v1/engine/async_llm.py | 6 ++- vllm/v1/engine/core.py | 2 - vllm/v1/worker/gpu_model_runner.py | 3 +- 7 files changed, 98 insertions(+), 52 deletions(-) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 38a47a846df7..e0b2d1eb28d1 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1604,12 +1604,6 @@ def _is_v1_supported_oracle(self, model_config: ModelConfig) -> bool: recommend_to_remove=False) return False - # No Embedding Models so far. - if model_config.task not in ["generate"]: - _raise_or_fallback(feature_name=f"--task {model_config.task}", - recommend_to_remove=False) - return False - # No Mamba or Encoder-Decoder so far. if not model_config.is_v1_compatible: _raise_or_fallback(feature_name=model_config.architectures, diff --git a/vllm/model_executor/models/bert.py b/vllm/model_executor/models/bert.py index 77b2ef0fce5f..7e5f9ea42820 100644 --- a/vllm/model_executor/models/bert.py +++ b/vllm/model_executor/models/bert.py @@ -18,15 +18,17 @@ from vllm.model_executor.layers.pooler import (CrossEncodingPooler, Pooler, PoolingType) from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.pooling_metadata import PoolingMetadata +from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors, PoolerOutput from vllm.transformers_utils.config import ( get_cross_encoder_activation_function) -from .interfaces import SupportsCrossEncoding, SupportsV0Only +from .interfaces import SupportsCrossEncoding from .utils import WeightsMapper, maybe_prefix @@ -323,6 +325,7 @@ def __init__(self, add_pooling_layer: bool = False): super().__init__() config = vllm_config.model_config.hf_config + self.config = config self.embeddings = embedding_class(config) self.encoder = BertEncoder(vllm_config=vllm_config, prefix=f"{prefix}.encoder") @@ -340,12 +343,16 @@ def forward( hidden_states = inputs_embeds else: attn_metadata = get_forward_context().attn_metadata - assert hasattr(attn_metadata, "seq_lens_tensor") - hidden_states = self.embeddings( - input_ids=input_ids, - seq_lens=attn_metadata.seq_lens_tensor, - position_ids=position_ids, - token_type_ids=token_type_ids) + seq_lens = None + if attn_metadata is not None: # Can be None during warmup + seq_lens = getattr(attn_metadata, "seq_lens_tensor", + attn_metadata.seq_lens) + assert seq_lens is not None + hidden_states = self.embeddings(input_ids=input_ids, + seq_lens=seq_lens, + position_ids=position_ids, + token_type_ids=token_type_ids) + return self.encoder(hidden_states) def load_weights(self, weights: Iterable[Tuple[str, @@ -385,7 +392,7 @@ def load_weights(self, weights: Iterable[Tuple[str, return loaded_params -class BertEmbeddingModel(nn.Module, SupportsV0Only): +class BertEmbeddingModel(nn.Module): """A 
model that uses Bert to provide embedding functionalities. This class encapsulates the BertModel and provides an interface for @@ -403,6 +410,8 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.model = self._build_model(vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")) self._pooler = self._build_pooler(pooler_config) + # TODO: Remove test scaffolding after pooling is implemented + self.sampler = get_sampler() def forward( self, @@ -411,10 +420,30 @@ def forward( intermediate_tensors: Optional[IntermediateTensors] = None, inputs_embeds: Optional[torch.Tensor] = None, ) -> torch.Tensor: - return self.model(input_ids=input_ids, - position_ids=positions, - inputs_embeds=inputs_embeds, - intermediate_tensors=intermediate_tensors) + hidden_states = self.model(input_ids=input_ids, + position_ids=positions, + inputs_embeds=inputs_embeds, + intermediate_tensors=intermediate_tensors) + return hidden_states + + # TODO: Remove test scaffolding after pooling is implemented + def compute_logits( + self, + hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[torch.Tensor]: + logits = torch.zeros( + (hidden_states.shape[0], self.model.config.vocab_size), + dtype=torch.half) + logits[:, 333] = 1.0 + + return logits + + # TODO: Remove test scaffolding after pooling is implemented + def sample(self, logits: torch.Tensor, + sampling_metadata: SamplingMetadata) -> Optional[SamplerOutput]: + next_tokens = self.sampler(logits, sampling_metadata) + return next_tokens def pooler( self, diff --git a/vllm/model_executor/models/roberta.py b/vllm/model_executor/models/roberta.py index ba92eef12707..978466937436 100644 --- a/vllm/model_executor/models/roberta.py +++ b/vllm/model_executor/models/roberta.py @@ -80,31 +80,33 @@ def forward( input_shape = input_ids.size() inputs_embeds = self.word_embeddings(input_ids) - # Replace position ids because in RoBERTa models - # they have to start at padding_idx + 1 and ignore - # existing padding tokens - # References: - # - https://github.com/huggingface/transformers/blob/a3d69a8994d673899608a7c17fbf4f953f50474e/src/transformers/models/roberta/modeling_roberta.py#L133 - # - https://github.com/huggingface/transformers/blob/a3d69a8994d673899608a7c17fbf4f953f50474e/src/transformers/models/roberta/modeling_roberta.py#L1669 - pos_list = [] - token_list = [] - offset = 0 - for seq_len in seq_lens: - pos_list.append(position_ids[offset:offset + seq_len]) - token_list.append(input_ids[offset:offset + seq_len]) - offset += seq_len - - new_pos_list = [] - for positions, tokens in zip(pos_list, token_list): - # Verify assumption that incoming position are - # always a sequence from 0 to N. 
- expected_pos = torch.arange(positions.size()[0], - dtype=torch.long, - device=inputs_embeds.device) - assert torch.equal(positions, expected_pos) - new_pos_list.append( - create_position_ids_from_input_ids(tokens, self.padding_idx)) - position_ids = torch.cat(new_pos_list) + if seq_lens is not None: # Can be None during warmup + # Replace position ids because in RoBERTa models + # they have to start at padding_idx + 1 and ignore + # existing padding tokens + # References: + # - https://github.com/huggingface/transformers/blob/a3d69a8994d673899608a7c17fbf4f953f50474e/src/transformers/models/roberta/modeling_roberta.py#L133 + # - https://github.com/huggingface/transformers/blob/a3d69a8994d673899608a7c17fbf4f953f50474e/src/transformers/models/roberta/modeling_roberta.py#L1669 + pos_list = [] + token_list = [] + offset = 0 + for seq_len in seq_lens: + pos_list.append(position_ids[offset:offset + seq_len]) + token_list.append(input_ids[offset:offset + seq_len]) + offset += seq_len + + new_pos_list = [] + for positions, tokens in zip(pos_list, token_list): + # Verify assumption that incoming position are + # always a sequence from 0 to N. + expected_pos = torch.arange(positions.size()[0], + dtype=torch.long, + device=inputs_embeds.device) + assert torch.equal(positions, expected_pos) + new_pos_list.append( + create_position_ids_from_input_ids(tokens, + self.padding_idx)) + position_ids = torch.cat(new_pos_list) # Position embeddings. position_embeddings = self.position_embeddings(position_ids) diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py index 92e4ffd0371a..e20090dee99c 100755 --- a/vllm/v1/attention/backends/flash_attn.py +++ b/vllm/v1/attention/backends/flash_attn.py @@ -198,11 +198,11 @@ def __init__( f"Supported head sizes are: {support_head_sizes}. " "Set VLLM_USE_V1=0 to use another attention backend.") - if attn_type != AttentionType.DECODER: - raise NotImplementedError("Encoder self-attention and " - "encoder/decoder cross-attention " - "are not implemented for " + if attn_type not in [AttentionType.DECODER, AttentionType.ENCODER]: + raise NotImplementedError("Encoder/decoder cross-attention " + "is not implemented for " "FlashAttentionImpl") + self.attn_type = attn_type self.vllm_flash_attn_version = get_flash_attn_version() if is_quantized_kv_cache(self.kv_cache_dtype) \ and not flash_attn_supports_fp8(): @@ -290,7 +290,7 @@ def forward( seqused_k=attn_metadata.seq_lens, max_seqlen_k=attn_metadata.max_seq_len, softmax_scale=self.scale, - causal=True, + causal=_get_causal_option(self.attn_type), alibi_slopes=self.alibi_slopes, window_size=self.sliding_window, block_table=attn_metadata.block_table, @@ -483,3 +483,21 @@ def cascade_attention( # Merge prefix and suffix outputs, and store the result in output. merge_attn_states(output, prefix_output, prefix_lse, suffix_output, suffix_lse) + + +def _get_causal_option(attn_type: str) -> bool: + """ + Determine whether the given attention type is suitable for causal + attention mechanisms. + + Args: + attn_type (AttentionType): The type of attention being evaluated + + Returns: + bool: Returns `True` if the attention type is suitable for causal + attention (i.e., not encoder, encoder-only, or encoder-decoder), + otherwise returns `False`. 
+ """ + return not (attn_type == AttentionType.ENCODER + or attn_type == AttentionType.ENCODER_ONLY + or attn_type == AttentionType.ENCODER_DECODER) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index e0169f1a4ded..df291c08488d 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -374,7 +374,11 @@ def encode( trace_headers: Optional[Mapping[str, str]] = None, priority: int = 0, ): - raise ValueError("Not Supported on V1 yet.") + return self.generate(prompt, + SamplingParams.from_optional(temperature=0, + max_tokens=1), + request_id=request_id) + #raise ValueError("Not Supported on V1 yet.") async def get_model_config(self) -> ModelConfig: return self.model_config diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index f4bb4583bea4..9a2d3961be18 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -49,8 +49,6 @@ def __init__( executor_class: type[Executor], log_stats: bool, ): - assert vllm_config.model_config.runner_type != "pooling" - logger.info("Initializing a V1 LLM engine (v%s) with config: %s", VLLM_VERSION, vllm_config) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 66358d963d51..6bb8e41f9af0 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1570,7 +1570,8 @@ def get_kv_cache_spec(self) -> dict[str, KVCacheSpec]: # TODO: Support other attention modules, e.g., sliding window, # cross-attention assert isinstance(attn_module, Attention) - if attn_module.attn_type == AttentionType.DECODER: + if attn_module.attn_type in \ + [AttentionType.DECODER, AttentionType.ENCODER]: kv_cache_spec[layer_name] = FullAttentionSpec( block_size=block_size, num_kv_heads=attn_module.num_kv_heads, From acf46382e3de7aef5b5694e862447ab346b7143e Mon Sep 17 00:00:00 2001 From: Max de Bayser Date: Thu, 3 Apr 2025 09:29:52 -0300 Subject: [PATCH 02/63] hack v1 flash_attn to support encoder_only Signed-off-by: Max de Bayser --- vllm/model_executor/models/bert.py | 7 +- vllm/v1/attention/backends/flash_attn.py | 116 ++++++++++++++--------- vllm/v1/engine/core.py | 47 +++++---- vllm/v1/worker/gpu_model_runner.py | 2 +- 4 files changed, 107 insertions(+), 65 deletions(-) diff --git a/vllm/model_executor/models/bert.py b/vllm/model_executor/models/bert.py index 7e5f9ea42820..a096a53dd926 100644 --- a/vllm/model_executor/models/bert.py +++ b/vllm/model_executor/models/bert.py @@ -117,7 +117,7 @@ def forward( self, hidden_states: torch.Tensor, ) -> torch.Tensor: - for layer in self.layer: + for i, layer in enumerate(self.layer): hidden_states = layer(hidden_states) return hidden_states @@ -432,9 +432,11 @@ def compute_logits( hidden_states: torch.Tensor, sampling_metadata: SamplingMetadata, ) -> Optional[torch.Tensor]: + print(f"{hidden_states=}") logits = torch.zeros( (hidden_states.shape[0], self.model.config.vocab_size), - dtype=torch.half) + dtype=torch.half, + device=hidden_states.device) logits[:, 333] = 1.0 return logits @@ -450,6 +452,7 @@ def pooler( hidden_states: torch.Tensor, pooling_metadata: PoolingMetadata, ) -> Optional[PoolerOutput]: + print(f"{hidden_states[-1]=}") return self._pooler(hidden_states, pooling_metadata) def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py index e20090dee99c..0c6a1f3374ed 100755 --- a/vllm/v1/attention/backends/flash_attn.py +++ b/vllm/v1/attention/backends/flash_attn.py @@ -198,7 +198,9 @@ def 
__init__( f"Supported head sizes are: {support_head_sizes}. " "Set VLLM_USE_V1=0 to use another attention backend.") - if attn_type not in [AttentionType.DECODER, AttentionType.ENCODER]: + if attn_type not in [ + AttentionType.DECODER, AttentionType.ENCODER_ONLY + ]: raise NotImplementedError("Encoder/decoder cross-attention " "is not implemented for " "FlashAttentionImpl") @@ -254,52 +256,80 @@ def forward( # not padded. However, we don't need to do key[:num_actual_tokens] and # value[:num_actual_tokens] because the reshape_and_cache_flash op uses # the slot_mapping's shape to determine the number of actual tokens. - key_cache, value_cache = kv_cache.unbind(0) - torch.ops._C_cache_ops.reshape_and_cache_flash( - key, - value, - key_cache, - value_cache, - attn_metadata.slot_mapping, - self.kv_cache_dtype, - layer._k_scale, - layer._v_scale, - ) - descale_shape = (attn_metadata.query_start_loc.shape[0] - 1, - key.shape[1]) - if self.kv_cache_dtype.startswith("fp8"): - key_cache = key_cache.view(torch.float8_e4m3fn) - value_cache = value_cache.view(torch.float8_e4m3fn) - num_tokens, num_heads, head_size = query.shape - query, _ = ops.scaled_fp8_quant( - query.reshape( - (num_tokens, num_heads * head_size)).contiguous(), - layer._q_scale) - query = query.reshape((num_tokens, num_heads, head_size)) + if kv_cache.numel() > 0: + key_cache, value_cache = kv_cache.unbind(0) + torch.ops._C_cache_ops.reshape_and_cache_flash( + key, + value, + key_cache, + value_cache, + attn_metadata.slot_mapping, + self.kv_cache_dtype, + layer._k_scale, + layer._v_scale, + ) + + if self.kv_cache_dtype.startswith("fp8"): + key_cache = key_cache.view(torch.float8_e4m3fn) + value_cache = value_cache.view(torch.float8_e4m3fn) + num_tokens, num_heads, head_size = query.shape + query, _ = ops.scaled_fp8_quant( + query.reshape( + (num_tokens, num_heads * head_size)).contiguous(), + layer._q_scale) + query = query.reshape((num_tokens, num_heads, head_size)) + else: + key_cache = key + value_cache = value # Compute attention and update output up to `num_actual_tokens`. if not attn_metadata.use_cascade: # Regular attention (common case). 
- flash_attn_varlen_func( - q=query[:num_actual_tokens], - k=key_cache, - v=value_cache, - out=output[:num_actual_tokens], - cu_seqlens_q=attn_metadata.query_start_loc, - max_seqlen_q=attn_metadata.max_query_len, - seqused_k=attn_metadata.seq_lens, - max_seqlen_k=attn_metadata.max_seq_len, - softmax_scale=self.scale, - causal=_get_causal_option(self.attn_type), - alibi_slopes=self.alibi_slopes, - window_size=self.sliding_window, - block_table=attn_metadata.block_table, - softcap=self.logits_soft_cap, - fa_version=self.vllm_flash_attn_version, - q_descale=layer._q_scale.expand(descale_shape), - k_descale=layer._k_scale.expand(descale_shape), - v_descale=layer._v_scale.expand(descale_shape), - ) + + descale_shape = (attn_metadata.query_start_loc.shape[0] - 1, + key.shape[1]) + if kv_cache.numel() > 0: + flash_attn_varlen_func( + q=query[:num_actual_tokens], + k=key_cache, + v=value_cache, + out=output[:num_actual_tokens], + cu_seqlens_q=attn_metadata.query_start_loc, + max_seqlen_q=attn_metadata.max_query_len, + seqused_k=attn_metadata.seq_lens, + max_seqlen_k=attn_metadata.max_seq_len, + softmax_scale=self.scale, + causal=_get_causal_option(self.attn_type), + alibi_slopes=self.alibi_slopes, + window_size=self.sliding_window, + block_table=attn_metadata.block_table, + softcap=self.logits_soft_cap, + fa_version=self.vllm_flash_attn_version, + q_descale=layer._q_scale.expand(descale_shape), + k_descale=layer._k_scale.expand(descale_shape), + v_descale=layer._v_scale.expand(descale_shape), + ) + else: + flash_attn_varlen_func( + q=query[:num_actual_tokens], + k=key, + v=value, + out=output[:num_actual_tokens], + cu_seqlens_q=attn_metadata.query_start_loc, + max_seqlen_q=attn_metadata.max_query_len, + #seqused_k=attn_metadata.seq_lens, + cu_seqlens_k=attn_metadata.query_start_loc, + max_seqlen_k=attn_metadata.max_seq_len, + softmax_scale=self.scale, + causal=_get_causal_option(self.attn_type), + alibi_slopes=self.alibi_slopes, + window_size=self.sliding_window, + softcap=self.logits_soft_cap, + fa_version=self.vllm_flash_attn_version, + q_descale=layer._q_scale.expand(descale_shape), + k_descale=layer._k_scale.expand(descale_shape), + v_descale=layer._v_scale.expand(descale_shape), + ) return output # Cascade attention (rare case). diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 9a2d3961be18..efe0130145cc 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -29,6 +29,7 @@ EngineCoreRequestType, UtilityOutput) from vllm.v1.engine.mm_input_cache import MMInputCacheServer from vllm.v1.executor.abstract import Executor +from vllm.v1.kv_cache_interface import KVCacheConfig from vllm.v1.outputs import ModelRunnerOutput from vllm.v1.request import Request, RequestStatus from vllm.v1.serial_utils import MsgpackDecoder, MsgpackEncoder @@ -121,25 +122,33 @@ def _initialize_kv_caches(self, assert len(kv_cache_specs) == len(available_gpu_memory) # Get the kv cache tensor size - kv_cache_configs = [ - get_kv_cache_config(vllm_config, kv_cache_spec_one_worker, - available_gpu_memory_one_worker) - for kv_cache_spec_one_worker, available_gpu_memory_one_worker in - zip(kv_cache_specs, available_gpu_memory) - ] - - # Since we use a shared centralized controller, we need the - # `kv_cache_config` to be consistent across all workers to make sure - # all the memory operators can be applied to all workers. - unify_kv_cache_configs(kv_cache_configs) - - # All workers have the same kv_cache_config except layer names, so use - # an arbitrary one to get the number of blocks. 
- assert all([ - cfg.num_blocks == kv_cache_configs[0].num_blocks - for cfg in kv_cache_configs - ]) - num_gpu_blocks = kv_cache_configs[0].num_blocks + if any(kv_cache_specs): + kv_cache_configs = [ + get_kv_cache_config(vllm_config, kv_cache_spec_one_worker, + available_gpu_memory_one_worker) + for kv_cache_spec_one_worker, available_gpu_memory_one_worker + in zip(kv_cache_specs, available_gpu_memory) + ] + # Since we use a shared centralized controller, we need the + # `kv_cache_config` to be consistent across all workers to make sure + # all the memory operators can be applied to all workers. + unify_kv_cache_configs(kv_cache_configs) + # All workers have the same kv_cache_config except layer names, + # so use an arbitrary one to get the number of blocks. + assert all([ + cfg.num_blocks == kv_cache_configs[0].num_blocks + for cfg in kv_cache_configs + ]) + num_gpu_blocks = kv_cache_configs[0].num_blocks + else: + kv_cache_configs = [] + kv_cache_configs = [ + KVCacheConfig(num_blocks=1, tensors={}, kv_cache_groups=[]) + for kv_cache_spec_one_worker in kv_cache_specs + ] + + num_gpu_blocks = 1 + num_cpu_blocks = 0 # Initialize kv cache and warmup the execution diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 6bb8e41f9af0..1c14c5892a0b 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1571,7 +1571,7 @@ def get_kv_cache_spec(self) -> dict[str, KVCacheSpec]: # cross-attention assert isinstance(attn_module, Attention) if attn_module.attn_type in \ - [AttentionType.DECODER, AttentionType.ENCODER]: + [AttentionType.DECODER]: kv_cache_spec[layer_name] = FullAttentionSpec( block_size=block_size, num_kv_heads=attn_module.num_kv_heads, From 8debea02477fbe5175fead686e2e44ea62a6d88f Mon Sep 17 00:00:00 2001 From: Max de Bayser Date: Thu, 3 Apr 2025 12:38:04 -0300 Subject: [PATCH 03/63] Revert changes to disable kv caching for encoder-only models Encoder-only models can also benefit from the prefix caching that is enabled by the kv cache Signed-off-by: Max de Bayser --- vllm/v1/attention/backends/flash_attn.py | 106 +++++++++-------------- vllm/v1/engine/core.py | 2 - vllm/v1/worker/gpu_model_runner.py | 9 +- 3 files changed, 45 insertions(+), 72 deletions(-) diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py index 0c6a1f3374ed..318178b064d3 100755 --- a/vllm/v1/attention/backends/flash_attn.py +++ b/vllm/v1/attention/backends/flash_attn.py @@ -256,31 +256,27 @@ def forward( # not padded. However, we don't need to do key[:num_actual_tokens] and # value[:num_actual_tokens] because the reshape_and_cache_flash op uses # the slot_mapping's shape to determine the number of actual tokens. 
- if kv_cache.numel() > 0: - key_cache, value_cache = kv_cache.unbind(0) - torch.ops._C_cache_ops.reshape_and_cache_flash( - key, - value, - key_cache, - value_cache, - attn_metadata.slot_mapping, - self.kv_cache_dtype, - layer._k_scale, - layer._v_scale, - ) + key_cache, value_cache = kv_cache.unbind(0) + torch.ops._C_cache_ops.reshape_and_cache_flash( + key, + value, + key_cache, + value_cache, + attn_metadata.slot_mapping, + self.kv_cache_dtype, + layer._k_scale, + layer._v_scale, + ) - if self.kv_cache_dtype.startswith("fp8"): - key_cache = key_cache.view(torch.float8_e4m3fn) - value_cache = value_cache.view(torch.float8_e4m3fn) - num_tokens, num_heads, head_size = query.shape - query, _ = ops.scaled_fp8_quant( - query.reshape( - (num_tokens, num_heads * head_size)).contiguous(), - layer._q_scale) - query = query.reshape((num_tokens, num_heads, head_size)) - else: - key_cache = key - value_cache = value + if self.kv_cache_dtype.startswith("fp8"): + key_cache = key_cache.view(torch.float8_e4m3fn) + value_cache = value_cache.view(torch.float8_e4m3fn) + num_tokens, num_heads, head_size = query.shape + query, _ = ops.scaled_fp8_quant( + query.reshape( + (num_tokens, num_heads * head_size)).contiguous(), + layer._q_scale) + query = query.reshape((num_tokens, num_heads, head_size)) # Compute attention and update output up to `num_actual_tokens`. if not attn_metadata.use_cascade: @@ -288,48 +284,26 @@ def forward( descale_shape = (attn_metadata.query_start_loc.shape[0] - 1, key.shape[1]) - if kv_cache.numel() > 0: - flash_attn_varlen_func( - q=query[:num_actual_tokens], - k=key_cache, - v=value_cache, - out=output[:num_actual_tokens], - cu_seqlens_q=attn_metadata.query_start_loc, - max_seqlen_q=attn_metadata.max_query_len, - seqused_k=attn_metadata.seq_lens, - max_seqlen_k=attn_metadata.max_seq_len, - softmax_scale=self.scale, - causal=_get_causal_option(self.attn_type), - alibi_slopes=self.alibi_slopes, - window_size=self.sliding_window, - block_table=attn_metadata.block_table, - softcap=self.logits_soft_cap, - fa_version=self.vllm_flash_attn_version, - q_descale=layer._q_scale.expand(descale_shape), - k_descale=layer._k_scale.expand(descale_shape), - v_descale=layer._v_scale.expand(descale_shape), - ) - else: - flash_attn_varlen_func( - q=query[:num_actual_tokens], - k=key, - v=value, - out=output[:num_actual_tokens], - cu_seqlens_q=attn_metadata.query_start_loc, - max_seqlen_q=attn_metadata.max_query_len, - #seqused_k=attn_metadata.seq_lens, - cu_seqlens_k=attn_metadata.query_start_loc, - max_seqlen_k=attn_metadata.max_seq_len, - softmax_scale=self.scale, - causal=_get_causal_option(self.attn_type), - alibi_slopes=self.alibi_slopes, - window_size=self.sliding_window, - softcap=self.logits_soft_cap, - fa_version=self.vllm_flash_attn_version, - q_descale=layer._q_scale.expand(descale_shape), - k_descale=layer._k_scale.expand(descale_shape), - v_descale=layer._v_scale.expand(descale_shape), - ) + flash_attn_varlen_func( + q=query[:num_actual_tokens], + k=key_cache, + v=value_cache, + out=output[:num_actual_tokens], + cu_seqlens_q=attn_metadata.query_start_loc, + max_seqlen_q=attn_metadata.max_query_len, + seqused_k=attn_metadata.seq_lens, + max_seqlen_k=attn_metadata.max_seq_len, + softmax_scale=self.scale, + causal=_get_causal_option(self.attn_type), + alibi_slopes=self.alibi_slopes, + window_size=self.sliding_window, + block_table=attn_metadata.block_table, + softcap=self.logits_soft_cap, + fa_version=self.vllm_flash_attn_version, + q_descale=layer._q_scale.expand(descale_shape), + 
k_descale=layer._k_scale.expand(descale_shape), + v_descale=layer._v_scale.expand(descale_shape), + ) return output # Cascade attention (rare case). diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 7207d4beb94a..aab0436030d7 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -56,8 +56,6 @@ def __init__( executor_class: type[Executor], log_stats: bool, ): - assert vllm_config.model_config.runner_type != "pooling" - logger.info("Initializing a V1 LLM engine (v%s) with config: %s", VLLM_VERSION, vllm_config) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 513806332efe..f050732ec5b0 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1677,7 +1677,9 @@ def get_kv_cache_spec(self) -> dict[str, KVCacheSpec]: # TODO: Support other attention modules, e.g., sliding window, # cross-attention assert isinstance(attn_module, Attention) - if attn_module.attn_type == AttentionType.DECODER: + # encoder only can also benefit from KV cache for prefix caching + if attn_module.attn_type in (AttentionType.DECODER, + AttentionType.ENCODER_ONLY): if attn_module.sliding_window is not None: kv_cache_spec[layer_name] = SlidingWindowSpec( block_size=block_size, @@ -1693,9 +1695,8 @@ def get_kv_cache_spec(self) -> dict[str, KVCacheSpec]: head_size=attn_module.head_size, dtype=self.kv_cache_dtype, use_mla=use_mla) - elif attn_module.attn_type in (AttentionType.ENCODER, - AttentionType.ENCODER_ONLY): - # encoder-only attention does not need KV cache. + elif attn_module.attn_type == AttentionType.ENCODER: + # encoder attention does not need KV cache. continue elif attn_module.attn_type == AttentionType.ENCODER_DECODER: raise NotImplementedError From 8d97b9cdcca3101a4166d918d7419b93bf497b07 Mon Sep 17 00:00:00 2001 From: Max de Bayser Date: Sat, 5 Apr 2025 12:51:24 -0300 Subject: [PATCH 04/63] Add pooling support in v1 This is only passing mypy, it hasn't been tested yet Signed-off-by: Max de Bayser --- vllm/model_executor/layers/pooler.py | 45 ++++++-- vllm/outputs.py | 9 +- vllm/v1/core/kv_cache_manager.py | 3 +- vllm/v1/core/sched/output.py | 5 +- vllm/v1/core/sched/scheduler.py | 3 +- vllm/v1/core/sched/utils.py | 18 +-- vllm/v1/engine/__init__.py | 7 +- vllm/v1/engine/async_llm.py | 62 ++++++++-- vllm/v1/engine/detokenizer.py | 2 + vllm/v1/engine/llm_engine.py | 8 +- vllm/v1/engine/logprobs.py | 1 + vllm/v1/engine/output_processor.py | 102 +++++++++++------ vllm/v1/engine/processor.py | 35 +++--- vllm/v1/outputs.py | 3 + vllm/v1/pool/__init__.py | 0 vllm/v1/pool/metadata.py | 13 +++ vllm/v1/request.py | 28 +++-- vllm/v1/sample/metadata.py | 7 +- vllm/v1/structured_output/__init__.py | 1 + vllm/v1/worker/gpu_input_batch.py | 159 ++++++++++++++------------ vllm/v1/worker/gpu_model_runner.py | 28 ++++- vllm/v1/worker/tpu_model_runner.py | 4 + 22 files changed, 379 insertions(+), 164 deletions(-) create mode 100644 vllm/v1/pool/__init__.py create mode 100644 vllm/v1/pool/metadata.py diff --git a/vllm/model_executor/layers/pooler.py b/vllm/model_executor/layers/pooler.py index 0012636ef9ff..8e9155c17f9d 100644 --- a/vllm/model_executor/layers/pooler.py +++ b/vllm/model_executor/layers/pooler.py @@ -10,11 +10,15 @@ from typing_extensions import assert_never from vllm.config import PoolerConfig -from vllm.model_executor.pooling_metadata import (PoolingMetadata, - PoolingTensors) +from vllm.model_executor.pooling_metadata import ( # noqa: E501 + PoolingMetadata as V0PoolingMetadata) +from 
vllm.model_executor.pooling_metadata import PoolingTensors from vllm.sequence import PoolerOutput, PoolingSequenceGroupOutput from vllm.transformers_utils.config import ( get_cross_encoder_activation_function) +from vllm.v1.pool.metadata import PoolingMetadata as V1PoolingMetadata + +PoolingMetadata = Union[V0PoolingMetadata, V1PoolingMetadata] class PoolingType(IntEnum): @@ -78,6 +82,8 @@ def get_prompt_lens( hidden_states: torch.Tensor, pooling_metadata: PoolingMetadata, ) -> torch.Tensor: + if isinstance(pooling_metadata, V1PoolingMetadata): + return pooling_metadata.prompt_lens return PoolingTensors.from_pooling_metadata( pooling_metadata, hidden_states.device).prompt_lens @@ -181,12 +187,27 @@ def __init__( self.step_tag_id = step_tag_id self.returned_token_ids = returned_token_ids + def get_prompt_token_ids( + self, + pooling_metadata: PoolingMetadata, + ) -> List[torch.Tensor]: + if isinstance(pooling_metadata, V1PoolingMetadata): + return [ + pooling_metadata.prompt_token_ids[i, :num] + for i, num in enumerate(pooling_metadata.prompt_lens) + ] + return [ + seq_data_i.prompt_token_ids + for seq_data_i in pooling_metadata.seq_data.values() + ] + def extract_states( self, hidden_states: torch.Tensor, pooling_metadata: PoolingMetadata, ) -> Union[list[torch.Tensor], torch.Tensor]: prompt_lens = self.get_prompt_lens(hidden_states, pooling_metadata) + prompt_token_ids = self.get_prompt_token_ids(pooling_metadata) returned_token_ids = self.returned_token_ids if returned_token_ids is not None and len(returned_token_ids) > 0: @@ -196,12 +217,11 @@ def extract_states( offset = 0 pooled_data = list[torch.Tensor]() - for prompt_len, seq_data_i in zip(prompt_lens, - pooling_metadata.seq_data.values()): + for i, prompt_len in enumerate(prompt_lens): pooled_data_i = hidden_states[offset:offset + prompt_len] if step_tag_id is not None: - token_ids = torch.tensor(seq_data_i.prompt_token_ids) - pooled_data_i = pooled_data_i[token_ids == step_tag_id] + pooled_data_i = pooled_data_i[prompt_token_ids[i] == + step_tag_id] offset += prompt_len pooled_data.append(pooled_data_i) @@ -287,6 +307,16 @@ def __init__( self.default_activation_function = \ get_cross_encoder_activation_function(config) + def get_prompt_lens( + self, + hidden_states: torch.Tensor, + pooling_metadata: PoolingMetadata, + ) -> torch.Tensor: + if isinstance(pooling_metadata, V1PoolingMetadata): + return pooling_metadata.prompt_lens + return PoolingTensors.from_pooling_metadata( + pooling_metadata, hidden_states.device).prompt_lens + def forward( self, hidden_states: torch.Tensor, @@ -294,8 +324,7 @@ def forward( ) -> PoolerOutput: """Pools sentence pair scores from the hidden_states.""" - prompt_lens = PoolingTensors.from_pooling_metadata( - pooling_metadata, hidden_states.device).prompt_lens + prompt_lens = self.get_prompt_lens(hidden_states, pooling_metadata) offset = 0 pooled_data_lst = [] diff --git a/vllm/outputs.py b/vllm/outputs.py index 014e8d5d8823..44266591187d 100644 --- a/vllm/outputs.py +++ b/vllm/outputs.py @@ -344,10 +344,11 @@ class PoolingRequestOutput(Generic[_O]): finished (bool): A flag indicating whether the pooling is completed. 
""" - def __init__(self, request_id: str, outputs: _O, + def __init__(self, request_id: str, outputs: _O, prompt: Optional[str], prompt_token_ids: list[int], finished: bool): self.request_id = request_id self.prompt_token_ids = prompt_token_ids + self.prompt = prompt self.finished = finished self.outputs = outputs @@ -359,9 +360,10 @@ def from_seq_group(seq_group: SequenceGroup) -> "PoolingRequestOutput": data = pooled_data.to(dtype=torch.float32, device="cpu") output = PoolingOutput(data) prompt_token_ids = seq_group.prompt_token_ids + prompt = seq_group.prompt finished = seq_group.is_finished() - return PoolingRequestOutput(seq_group.request_id, output, + return PoolingRequestOutput(seq_group.request_id, output, prompt, prompt_token_ids, finished) def __repr__(self): @@ -426,6 +428,7 @@ def from_base(request_output: PoolingRequestOutput): return EmbeddingRequestOutput( request_id=request_output.request_id, outputs=EmbeddingOutput.from_base(request_output.outputs), + prompt=request_output.prompt, prompt_token_ids=request_output.prompt_token_ids, finished=request_output.finished, ) @@ -464,6 +467,7 @@ def from_base(request_output: PoolingRequestOutput): return ClassificationRequestOutput( request_id=request_output.request_id, outputs=ClassificationOutput.from_base(request_output.outputs), + prompt=request_output.prompt, prompt_token_ids=request_output.prompt_token_ids, finished=request_output.finished, ) @@ -503,6 +507,7 @@ def from_base(request_output: PoolingRequestOutput): return ScoringRequestOutput( request_id=request_output.request_id, outputs=ScoringOutput.from_base(request_output.outputs), + prompt=request_output.prompt, prompt_token_ids=request_output.prompt_token_ids, finished=request_output.finished, ) diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py index c0f7715209d1..4a8a2c2e9005 100644 --- a/vllm/v1/core/kv_cache_manager.py +++ b/vllm/v1/core/kv_cache_manager.py @@ -126,7 +126,8 @@ def get_computed_blocks( self.req_to_block_hashes[request.request_id] = block_hashes self.prefix_cache_stats.requests += 1 - if request.sampling_params.prompt_logprobs is None: + if request.sampling_params and \ + request.sampling_params.prompt_logprobs is None: if len(block_hashes) * self.block_size == request.num_tokens: # When prompt length is divisible by the block size and all # blocks are cached, we need to recompute the last token. 
This diff --git a/vllm/v1/core/sched/output.py b/vllm/v1/core/sched/output.py index dc0d2d59fea7..b8895be0dbc4 100644 --- a/vllm/v1/core/sched/output.py +++ b/vllm/v1/core/sched/output.py @@ -11,6 +11,7 @@ from vllm.lora.request import LoRARequest from vllm.multimodal.inputs import MultiModalKwargs, PlaceholderRange + from vllm.pooling_params import PoolingParams from vllm.sampling_params import SamplingParams from vllm.v1.request import Request @@ -24,7 +25,8 @@ class NewRequestData: mm_inputs: list[MultiModalKwargs] mm_hashes: list[str] mm_positions: list[PlaceholderRange] - sampling_params: SamplingParams + sampling_params: Optional[SamplingParams] + pooling_params: Optional[PoolingParams] block_ids: list[int] num_computed_tokens: int lora_request: Optional[LoRARequest] @@ -43,6 +45,7 @@ def from_request( mm_hashes=request.mm_hashes, mm_positions=request.mm_positions, sampling_params=request.sampling_params, + pooling_params=request.pooling_params, block_ids=block_ids, num_computed_tokens=request.num_computed_tokens, lora_request=request.lora_request, diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index a0865c8fd845..58a020ab42e1 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -628,7 +628,8 @@ def update_from_output( break # Extract sample logprobs if needed. - if request.sampling_params.logprobs is not None and logprobs: + if request.sampling_params \ + and request.sampling_params.logprobs is not None and logprobs: # NOTE: once we support N tokens per step (spec decode), # the outer lists can be of length > 1. new_logprobs = logprobs.slice(req_index, req_index + 1) diff --git a/vllm/v1/core/sched/utils.py b/vllm/v1/core/sched/utils.py index 3a0028a59016..e9cc320ee236 100644 --- a/vllm/v1/core/sched/utils.py +++ b/vllm/v1/core/sched/utils.py @@ -8,15 +8,15 @@ def check_stop(request: Request, max_model_len: int) -> bool: request.status = RequestStatus.FINISHED_LENGTH_CAPPED return True - sampling_params = request.sampling_params last_token_id = request.output_token_ids[-1] - if (not sampling_params.ignore_eos - and last_token_id == request.eos_token_id): - request.status = RequestStatus.FINISHED_STOPPED - return True + if (sampling_params := request.sampling_params) is not None: + if (not sampling_params.ignore_eos + and last_token_id == request.eos_token_id): + request.status = RequestStatus.FINISHED_STOPPED + return True - if last_token_id in (sampling_params.stop_token_ids or ()): - request.status = RequestStatus.FINISHED_STOPPED - request.stop_reason = last_token_id - return True + if last_token_id in (sampling_params.stop_token_ids or ()): + request.status = RequestStatus.FINISHED_STOPPED + request.stop_reason = last_token_id + return True return False diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py index 0557d0c6c19d..f9ec79db4457 100644 --- a/vllm/v1/engine/__init__.py +++ b/vllm/v1/engine/__init__.py @@ -5,10 +5,12 @@ from typing import Any, Optional, Union import msgspec +import torch from vllm.lora.request import LoRARequest from vllm.multimodal import MultiModalKwargs from vllm.multimodal.inputs import PlaceholderRange +from vllm.pooling_params import PoolingParams from vllm.sampling_params import SamplingParams from vllm.v1.metrics.stats import SchedulerStats from vllm.v1.outputs import LogprobsLists, LogprobsTensors @@ -55,7 +57,8 @@ class EngineCoreRequest( mm_inputs: Optional[list[MultiModalKwargs]] mm_hashes: Optional[list[str]] mm_placeholders: Optional[list[PlaceholderRange]] - 
sampling_params: SamplingParams + sampling_params: Optional[SamplingParams] + pooling_params: Optional[PoolingParams] eos_token_id: Optional[int] arrival_time: float lora_request: Optional[LoRARequest] @@ -98,6 +101,8 @@ class EngineCoreOutput( new_logprobs: Optional[LogprobsLists] = None new_prompt_logprobs_tensors: Optional[LogprobsTensors] = None + pooling_output: Optional[torch.Tensor] = None + finish_reason: Optional[FinishReason] = None stop_reason: Union[int, str, None] = None events: Optional[list[EngineCoreEvent]] = None diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 3543d6a01ff8..62163ebb410e 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -19,7 +19,7 @@ from vllm.logger import init_logger from vllm.lora.request import LoRARequest from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry -from vllm.outputs import RequestOutput +from vllm.outputs import PoolingRequestOutput, RequestOutput from vllm.pooling_params import PoolingParams from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.sampling_params import SamplingParams @@ -368,7 +368,7 @@ def _record_stats( stat_logger.record(scheduler_stats=scheduler_stats, iteration_stats=iteration_stats) - def encode( + async def encode( self, prompt: PromptType, pooling_params: PoolingParams, @@ -376,12 +376,58 @@ def encode( lora_request: Optional[LoRARequest] = None, trace_headers: Optional[Mapping[str, str]] = None, priority: int = 0, - ): - return self.generate(prompt, - SamplingParams.from_optional(temperature=0, - max_tokens=1), - request_id=request_id) - #raise ValueError("Not Supported on V1 yet.") + ) -> AsyncGenerator[PoolingRequestOutput, None]: + """ + Main function called by the API server to kick off a request + * 1) Making an AsyncStream corresponding to the Request. + * 2) Processing the Input. + * 3) Adding the Request to the Detokenizer. + * 4) Adding the Request to the EngineCore (separate process). + + A separate output_handler loop runs in a background AsyncIO task, + pulling outputs from EngineCore and putting them into the + per-request AsyncStream. + + The caller of generate() iterates the returned AsyncGenerator, + returning the RequestOutput back to the caller. + """ + + try: + # We start the output_handler on the first call to generate() so + # we can call __init__ before the event loop, which enables us + # to handle startup failure gracefully in the OpenAI server. + if self.output_handler is None: + self.output_handler = asyncio.create_task( + self._run_output_handler()) + + q = await self.add_request( + request_id, + prompt, + pooling_params, + lora_request=lora_request, + trace_headers=trace_headers, + priority=priority, + ) + + # The output_handler task pushes items into the queue. + # This task pulls from the queue and yields to caller. + finished = False + while not finished: + # Note: drain queue without await if possible (avoids + # task switching under load which helps performance). + out = q.get_nowait() or await q.get() + assert type(out) is PoolingRequestOutput + # Note: both OutputProcessor and EngineCore handle their + # own request cleanup based on finished. + finished = out.finished + yield out + + # If the request is disconnected by the client, the + # generate() task will be canceled. So, we abort the + # request if we end up here. 
+ except asyncio.CancelledError: + await self.abort(request_id) + raise async def get_model_config(self) -> ModelConfig: return self.model_config diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index bf06a17507b2..50db42034e23 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -53,6 +53,8 @@ def from_new_request( request: EngineCoreRequest, ) -> "IncrementalDetokenizer": + assert request.sampling_params is not None + if tokenizer is None: return cls(token_ids=[]) diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index 4c67186f7040..d905e57aca35 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -2,7 +2,7 @@ from collections.abc import Mapping from copy import copy -from typing import Any, Callable, Optional, Union +from typing import Any, Callable, Optional, Union, cast from typing_extensions import TypeVar @@ -15,7 +15,7 @@ from vllm.logger import init_logger from vllm.lora.request import LoRARequest from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry -from vllm.outputs import RequestOutput +from vllm.outputs import PoolingRequestOutput, RequestOutput from vllm.pooling_params import PoolingParams from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.sampling_params import SamplingParams @@ -199,7 +199,7 @@ def add_request( return # Fan out child requests (for n>1). - parent_req = ParentRequest(request_id, params) + parent_req = ParentRequest(request_id, cast(SamplingParams, params)) for idx in range(n): request_id, params = parent_req.get_child_info(idx) child_request = request if idx == n - 1 else copy(request) @@ -211,7 +211,7 @@ def add_request( # Add the request to EngineCore. self.engine_core.add_request(child_request) - def step(self) -> list[RequestOutput]: + def step(self) -> list[RequestOutput | PoolingRequestOutput]: if self.should_execute_dummy_batch: self.should_execute_dummy_batch = False diff --git a/vllm/v1/engine/logprobs.py b/vllm/v1/engine/logprobs.py index 03d82b6bbc1d..283479311814 100644 --- a/vllm/v1/engine/logprobs.py +++ b/vllm/v1/engine/logprobs.py @@ -37,6 +37,7 @@ def from_new_request( tokenizer: Optional[AnyTokenizer], request: EngineCoreRequest, ) -> "LogprobsProcessor": + assert request.sampling_params is not None num_logprobs = request.sampling_params.logprobs num_prompt_logprobs = request.sampling_params.prompt_logprobs return cls( diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py index 70f072d3c939..1f73ddcaedde 100644 --- a/vllm/v1/engine/output_processor.py +++ b/vllm/v1/engine/output_processor.py @@ -3,10 +3,13 @@ import asyncio from collections.abc import Iterable from dataclasses import dataclass -from typing import Optional, Union +from typing import Optional, Union, cast -from vllm.outputs import CompletionOutput, RequestOutput -from vllm.sampling_params import RequestOutputKind +import torch + +from vllm.outputs import (CompletionOutput, PoolingOutput, + PoolingRequestOutput, RequestOutput) +from vllm.sampling_params import RequestOutputKind, SamplingParams from vllm.transformers_utils.tokenizer import AnyTokenizer from vllm.transformers_utils.tokenizer_group import BaseTokenizerGroup from vllm.v1.engine import EngineCoreOutput, EngineCoreRequest, FinishReason @@ -28,28 +31,30 @@ class RequestOutputCollector: def __init__(self, output_kind: RequestOutputKind): self.aggregate = output_kind == RequestOutputKind.DELTA - self.output: Optional[RequestOutput] = None + 
self.output: Optional[Union[RequestOutput, + PoolingRequestOutput]] = None self.ready = asyncio.Event() - def put(self, output: RequestOutput) -> None: + def put(self, output: Union[RequestOutput, PoolingRequestOutput]) -> None: if self.output is None: self.output = output self.ready.set() elif self.aggregate: # Coalesce the outputs in delta case. - self.output.add(output) + cast(RequestOutput, self.output).add(cast(RequestOutput, output)) else: # Just replace latest in non-delta case. self.output = output - async def get(self) -> RequestOutput: + async def get(self) -> Union[RequestOutput, PoolingRequestOutput]: while (output := self.output) is None: await self.ready.wait() self.output = None self.ready.clear() return output - def get_nowait(self) -> Optional[RequestOutput]: + def get_nowait( + self) -> Optional[Union[RequestOutput, PoolingRequestOutput]]: output = self.output if output is not None: self.output = None @@ -60,7 +65,7 @@ def get_nowait(self) -> Optional[RequestOutput]: @dataclass class OutputProcessorOutput: - request_outputs: list[RequestOutput] + request_outputs: list[Union[RequestOutput, PoolingRequestOutput]] reqs_to_abort: list[str] @@ -109,15 +114,20 @@ def from_new_request( queue: Optional[RequestOutputCollector], log_stats: bool, ) -> "RequestState": - if not request.sampling_params.detokenize: + + sampling_params = request.sampling_params \ + if request.sampling_params \ + else SamplingParams.from_optional() + if not sampling_params.detokenize: tokenizer = None + return cls( request_id=request.request_id, parent_req=parent_req, request_index=request_index, lora_name=(request.lora_request.name if request.lora_request is not None else None), - output_kind=request.sampling_params.output_kind, + output_kind=sampling_params.output_kind, prompt=request.prompt, prompt_token_ids=request.prompt_token_ids, logprobs_processor=LogprobsProcessor.from_new_request( @@ -128,8 +138,8 @@ def from_new_request( tokenizer=tokenizer, request=request, ), - max_tokens_param=(request.sampling_params.max_tokens if - request.sampling_params is not None else None), + max_tokens_param=(sampling_params.max_tokens + if sampling_params is not None else None), arrival_time=request.arrival_time, queue=queue, log_stats=log_stats, @@ -138,9 +148,10 @@ def from_new_request( def make_request_output( self, new_token_ids: list[int], + pooling_output: Optional[torch.Tensor], finish_reason: Optional[FinishReason], stop_reason: Union[int, str, None], - ) -> Optional[RequestOutput]: + ) -> Optional[Union[RequestOutput | PoolingRequestOutput]]: finished = finish_reason is not None final_only = self.output_kind == RequestOutputKind.FINAL_ONLY @@ -149,15 +160,20 @@ def make_request_output( # Only the final output is required in FINAL_ONLY mode. 
return None - completion_output = self._new_completion_output( - new_token_ids, finish_reason, stop_reason) - request_id = self.request_id + if pooling_output: + return self._new_request_output( + request_id, [self._new_pooling_output(pooling_output)], + finished) + + output = self._new_completion_output(new_token_ids, finish_reason, + stop_reason) + if self.parent_req is None: - outputs = [completion_output] + outputs = [output] else: request_id, outputs, finished = self.parent_req.get_outputs( - request_id, completion_output) + request_id, output) if not outputs: return None @@ -166,10 +182,19 @@ def make_request_output( def _new_request_output( self, request_id: str, - outputs: list[CompletionOutput], + outputs: Union[list[CompletionOutput], list[PoolingOutput]], finished: bool, - ) -> RequestOutput: - + ) -> RequestOutput | PoolingRequestOutput: + + if isinstance(outputs[0], PoolingOutput): + assert len(outputs) == 1 + return PoolingRequestOutput( + request_id=request_id, + outputs=outputs[0], + prompt=self.prompt, + prompt_token_ids=self.prompt_token_ids, + finished=finished, + ) if self.output_kind == RequestOutputKind.DELTA: # Side effect: logprobs processor forgets prompt logprobs prompt_logprobs = self.logprobs_processor.pop_prompt_logprobs() @@ -181,7 +206,7 @@ def _new_request_output( prompt=self.prompt, prompt_token_ids=self.prompt_token_ids, prompt_logprobs=prompt_logprobs, - outputs=outputs, + outputs=cast(list[CompletionOutput], outputs), finished=finished, ) @@ -214,6 +239,13 @@ def _new_completion_output( finish_reason=str(finish_reason) if finished else None, stop_reason=stop_reason if finished else None) + def _new_pooling_output( + self, + pooling_output: torch.Tensor, + ) -> PoolingOutput: + + return PoolingOutput(data=pooling_output) + class OutputProcessor: """Process EngineCoreOutputs into RequestOutputs.""" @@ -305,7 +337,7 @@ def process_outputs( ********************************************************** """ - request_outputs: list[RequestOutput] = [] + request_outputs: list[RequestOutput | PoolingRequestOutput] = [] reqs_to_abort: list[str] = [] for engine_core_output in engine_core_outputs: req_id = engine_core_output.request_id @@ -320,24 +352,28 @@ def process_outputs( iteration_stats) new_token_ids = engine_core_output.new_token_ids + pooling_output = engine_core_output.pooling_output finish_reason = engine_core_output.finish_reason stop_reason = engine_core_output.stop_reason req_state.is_prefilling = False - # 2) Detokenize the token ids into text and perform stop checks. - stop_string = req_state.detokenizer.update( - new_token_ids, finish_reason == FinishReason.STOP) - if stop_string: - finish_reason = FinishReason.STOP - stop_reason = stop_string + if not pooling_output: + # 2) Detokenize the token ids into text and perform stop checks. + stop_string = req_state.detokenizer.update( + new_token_ids, finish_reason == FinishReason.STOP) + if stop_string: + finish_reason = FinishReason.STOP + stop_reason = stop_string - # 3) Compute sample and prompt logprobs for request, if required. - req_state.logprobs_processor.update_from_output(engine_core_output) + # 3) Compute sample and prompt logprobs for request, + # if required. + req_state.logprobs_processor.update_from_output( + engine_core_output) # 4) Create and handle RequestOutput objects. 
if request_output := req_state.make_request_output( - new_token_ids, finish_reason, stop_reason): + new_token_ids, pooling_output, finish_reason, stop_reason): if req_state.queue is not None: # AsyncLLM: put into queue for handling by generate(). req_state.queue.put(request_output) diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index 0d2892837eb2..2017c69c5dac 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -104,8 +104,8 @@ def _validate_params( Should raise ValueError if unsupported for API Server. """ - if not isinstance(params, SamplingParams): - raise ValueError("V1 does not yet support Pooling models.") + if isinstance(params, PoolingParams): + return self._validate_logprobs(params) self._validate_sampling_params(params) @@ -214,19 +214,6 @@ def process_inputs( if encoder_inputs is not None: raise NotImplementedError - assert isinstance(params, SamplingParams) - # TODO: can we avoid cloning here in multiproc case? - sampling_params = params.clone() - # If unset max tokens, then generate up to the max_model_len. - if sampling_params.max_tokens is None: - sampling_params.max_tokens = ( - self.model_config.max_model_len - - len(decoder_inputs["prompt_token_ids"])) - sampling_params.update_from_generation_config( - self.generation_config_fields, eos_token_id) - sampling_params.update_from_tokenizer( - self.tokenizer.get_lora_tokenizer(lora_request)) - # Multimodal related. sorted_mm_inputs: Optional[list[MultiModalKwargs]] = None sorted_mm_positions: Optional[list[PlaceholderRange]] = None @@ -267,6 +254,23 @@ def process_inputs( decoder_mm_inputs.get_items(sorted_item_modalities[0]) ] + sampling_params = None + pooling_params = None + if isinstance(params, SamplingParams): + # TODO: can we avoid cloning here in multiproc case? + sampling_params = params.clone() + # If unset max tokens, then generate up to the max_model_len. 
+ if sampling_params.max_tokens is None: + sampling_params.max_tokens = ( + self.model_config.max_model_len - + len(decoder_inputs["prompt_token_ids"])) + sampling_params.update_from_generation_config( + self.generation_config_fields, eos_token_id) + sampling_params.update_from_tokenizer( + self.tokenizer.get_lora_tokenizer(lora_request)) + else: + pooling_params = params.clone() + return EngineCoreRequest( request_id=request_id, prompt=decoder_inputs.get("prompt"), @@ -275,6 +279,7 @@ def process_inputs( mm_hashes=sorted_mm_hashes, mm_placeholders=sorted_mm_positions, sampling_params=sampling_params, + pooling_params=pooling_params, eos_token_id=eos_token_id, arrival_time=arrival_time, lora_request=lora_request, diff --git a/vllm/v1/outputs.py b/vllm/v1/outputs.py index 2732b933c28a..0ebd2128d473 100644 --- a/vllm/v1/outputs.py +++ b/vllm/v1/outputs.py @@ -100,6 +100,8 @@ class ModelRunnerOutput: # [prompt_len] prompt_logprobs_dict: dict[str, Optional[LogprobsTensors]] + pooler_output: list[torch.Tensor] + EMPTY_MODEL_RUNNER_OUTPUT = ModelRunnerOutput( req_ids=[], @@ -108,4 +110,5 @@ class ModelRunnerOutput: spec_token_ids=None, logprobs=None, prompt_logprobs_dict={}, + pooler_output=[], ) diff --git a/vllm/v1/pool/__init__.py b/vllm/v1/pool/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/vllm/v1/pool/metadata.py b/vllm/v1/pool/metadata.py new file mode 100644 index 000000000000..51eb1d97f02d --- /dev/null +++ b/vllm/v1/pool/metadata.py @@ -0,0 +1,13 @@ +# SPDX-License-Identifier: Apache-2.0 +from dataclasses import dataclass +from typing import Optional + +import torch + + +@dataclass +class PoolingMetadata: + """Tensors for pooling.""" + + prompt_lens: torch.Tensor + prompt_token_ids: Optional[torch.Tensor] diff --git a/vllm/v1/request.py b/vllm/v1/request.py index 490fe4e83d3a..711b3b373739 100644 --- a/vllm/v1/request.py +++ b/vllm/v1/request.py @@ -3,6 +3,7 @@ import enum from typing import TYPE_CHECKING, Optional, Union +from vllm.pooling_params import PoolingParams from vllm.sampling_params import SamplingParams from vllm.v1.engine import (EngineCoreEvent, EngineCoreEventType, EngineCoreRequest, FinishReason) @@ -26,7 +27,8 @@ def __init__( multi_modal_inputs: Optional[list["MultiModalKwargs"]], multi_modal_hashes: Optional[list[str]], multi_modal_placeholders: Optional[list["PlaceholderRange"]], - sampling_params: SamplingParams, + sampling_params: Optional[SamplingParams], + pooling_params: Optional[PoolingParams], eos_token_id: Optional[int], arrival_time: float, lora_request: Optional["LoRARequest"] = None, @@ -34,18 +36,25 @@ def __init__( ) -> None: self.request_id = request_id self.sampling_params = sampling_params + self.pooling_params = pooling_params # Because of LoRA, the eos token id can be different for each request. 
self.eos_token_id = eos_token_id self.lora_request = lora_request self.structured_output_request = structured_output_request - self.status = (RequestStatus.WAITING_FOR_FSM - if sampling_params.guided_decoding is not None else - RequestStatus.WAITING) + self.status = RequestStatus.WAITING self.events: list[EngineCoreEvent] = [] self.stop_reason: Union[int, str, None] = None - assert sampling_params.max_tokens is not None - self.max_tokens = sampling_params.max_tokens + if pooling_params is not None: + self.max_tokens = 1 + elif sampling_params is not None: + assert sampling_params.max_tokens is not None + self.max_tokens = sampling_params.max_tokens + if sampling_params.guided_decoding is not None: + self.status = RequestStatus.WAITING_FOR_FSM + else: + raise ValueError( + "sampling_params and pooling_params can't both be set") self.prompt = prompt self.prompt_token_ids = prompt_token_ids @@ -83,11 +92,13 @@ def from_engine_core_request(cls, request: EngineCoreRequest) -> "Request": multi_modal_hashes=request.mm_hashes, multi_modal_placeholders=request.mm_placeholders, sampling_params=request.sampling_params, + pooling_params=request.pooling_params, eos_token_id=request.eos_token_id, arrival_time=request.arrival_time, lora_request=request.lora_request, structured_output_request=StructuredOutputRequest( - sampling_params=request.sampling_params), + sampling_params=request.sampling_params) \ + if request.sampling_params else None, ) def append_output_token_ids( @@ -126,7 +137,8 @@ def get_num_encoder_tokens(self, input_id: int) -> int: @property def use_structured_output(self) -> bool: - return self.sampling_params.guided_decoding is not None + return self.sampling_params is not None and \ + self.sampling_params.guided_decoding is not None def record_event( self, diff --git a/vllm/v1/sample/metadata.py b/vllm/v1/sample/metadata.py index e97e1235fb36..c8f9c800f0f1 100644 --- a/vllm/v1/sample/metadata.py +++ b/vllm/v1/sample/metadata.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 from dataclasses import dataclass -from typing import Optional +from typing import Any, Optional import torch @@ -41,3 +41,8 @@ class SamplingMetadata: # req_index -> bad_words_token_ids bad_words_token_ids: dict[int, list[list[int]]] + + +@dataclass +class PoolingMetadata: + additional_data: Optional[Any] = None diff --git a/vllm/v1/structured_output/__init__.py b/vllm/v1/structured_output/__init__.py index 218af43deb67..38a6b2474c7f 100644 --- a/vllm/v1/structured_output/__init__.py +++ b/vllm/v1/structured_output/__init__.py @@ -45,6 +45,7 @@ def grammar_init(self, request: Request) -> None: # NOTE: We only support a single backend. We do NOT support different # backends on a per-request basis in V1 (for now, anyway...). 
if self.backend is None: + assert request.sampling_params is not None backend_name = request.sampling_params.guided_decoding.backend_name if backend_name == "xgrammar": from vllm.v1.structured_output.backend_xgrammar import ( diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py index a64cb97e0123..b53229752525 100644 --- a/vllm/v1/worker/gpu_input_batch.py +++ b/vllm/v1/worker/gpu_input_batch.py @@ -9,9 +9,11 @@ from vllm.lora.request import LoRARequest from vllm.multimodal.inputs import MultiModalKwargs, PlaceholderRange +from vllm.pooling_params import PoolingParams from vllm.sampling_params import SamplingParams, SamplingType from vllm.utils import swap_dict_values from vllm.v1.outputs import LogprobsTensors +from vllm.v1.pool.metadata import PoolingMetadata from vllm.v1.sample.metadata import SamplingMetadata from vllm.v1.utils import copy_slice from vllm.v1.worker.block_table import BlockTable @@ -27,7 +29,8 @@ class CachedRequestState: prompt: Optional[str] mm_inputs: list[MultiModalKwargs] mm_positions: list[PlaceholderRange] - sampling_params: SamplingParams + sampling_params: Optional[SamplingParams] + pooling_params: Optional[PoolingParams] generator: Optional[torch.Generator] block_ids: list[int] @@ -223,6 +226,8 @@ def __init__( # This is updated each time the batch constituents change. self.sampling_metadata = self._make_sampling_metadata() + self.pooling_reqs: set[str] = set() + @property def req_ids(self) -> list[str]: # None elements should only be present transiently @@ -266,77 +271,82 @@ def add_request( self.num_computed_tokens_cpu[req_index] = request.num_computed_tokens self.block_table.add_row(request.block_ids, req_index) - sampling_params = request.sampling_params - if sampling_params.sampling_type == SamplingType.GREEDY: - # Avoid later division by zero. - self.temperature_cpu[req_index] = -1.0 - self.greedy_reqs.add(req_id) - else: - self.temperature_cpu[req_index] = sampling_params.temperature - self.random_reqs.add(req_id) - - self.top_p_cpu[req_index] = sampling_params.top_p - if sampling_params.top_p < 1: - self.top_p_reqs.add(req_id) - top_k = sampling_params.top_k - if 0 < top_k < self.vocab_size: - self.top_k_reqs.add(req_id) - else: - top_k = self.vocab_size - self.top_k_cpu[req_index] = top_k - self.min_p_cpu[req_index] = sampling_params.min_p - self.frequency_penalties_cpu[ - req_index] = sampling_params.frequency_penalty - if sampling_params.min_p > _SAMPLING_EPS: - self.min_p_reqs.add(req_id) - if sampling_params.frequency_penalty != 0.0: - self.frequency_penalties_reqs.add(req_id) - self.presence_penalties_cpu[ - req_index] = sampling_params.presence_penalty - if sampling_params.presence_penalty != 0.0: - self.presence_penalties_reqs.add(req_id) - self.repetition_penalties_cpu[ - req_index] = sampling_params.repetition_penalty - if sampling_params.repetition_penalty != 1.0: - self.repetition_penalties_reqs.add(req_id) - if sampling_params.min_tokens: - self.min_tokens[req_index] = (sampling_params.min_tokens, - sampling_params.all_stop_token_ids) - - # NOTE(woosuk): self.generators should not include the requests that - # do not have their own generator. 
- if request.generator is not None: - self.generators[req_index] = request.generator - - if sampling_params.logprobs is not None: - self.num_logprobs[req_id] = sampling_params.logprobs - if sampling_params.prompt_logprobs is not None: - self.num_prompt_logprobs[req_id] = sampling_params.prompt_logprobs - if sampling_params.logit_bias is not None: - self.logit_bias[req_index] = sampling_params.logit_bias - - if sampling_params.allowed_token_ids: - self.has_allowed_token_ids.add(req_id) - if self.allowed_token_ids_mask_cpu_tensor is None: - # Lazy allocation for this tensor, which can be large. + if sampling_params := request.sampling_params: + if sampling_params.sampling_type == SamplingType.GREEDY: + # Avoid later division by zero. + self.temperature_cpu[req_index] = -1.0 + self.greedy_reqs.add(req_id) + else: + self.temperature_cpu[req_index] = sampling_params.temperature + self.random_reqs.add(req_id) + + self.top_p_cpu[req_index] = sampling_params.top_p + if sampling_params.top_p < 1: + self.top_p_reqs.add(req_id) + top_k = sampling_params.top_k + if 0 < top_k < self.vocab_size: + self.top_k_reqs.add(req_id) + else: + top_k = self.vocab_size + self.top_k_cpu[req_index] = top_k + self.min_p_cpu[req_index] = sampling_params.min_p + self.frequency_penalties_cpu[ + req_index] = sampling_params.frequency_penalty + if sampling_params.min_p > _SAMPLING_EPS: + self.min_p_reqs.add(req_id) + if sampling_params.frequency_penalty != 0.0: + self.frequency_penalties_reqs.add(req_id) + self.presence_penalties_cpu[ + req_index] = sampling_params.presence_penalty + if sampling_params.presence_penalty != 0.0: + self.presence_penalties_reqs.add(req_id) + self.repetition_penalties_cpu[ + req_index] = sampling_params.repetition_penalty + if sampling_params.repetition_penalty != 1.0: + self.repetition_penalties_reqs.add(req_id) + if sampling_params.min_tokens: + self.min_tokens[req_index] = ( + sampling_params.min_tokens, + sampling_params.all_stop_token_ids) + + # NOTE(woosuk): self.generators should not include the requests that + # do not have their own generator. + if request.generator is not None: + self.generators[req_index] = request.generator + + if sampling_params.logprobs is not None: + self.num_logprobs[req_id] = sampling_params.logprobs + if sampling_params.prompt_logprobs is not None: + self.num_prompt_logprobs[ + req_id] = sampling_params.prompt_logprobs + if sampling_params.logit_bias is not None: + self.logit_bias[req_index] = sampling_params.logit_bias + + if sampling_params.allowed_token_ids: + self.has_allowed_token_ids.add(req_id) + if self.allowed_token_ids_mask_cpu_tensor is None: + # Lazy allocation for this tensor, which can be large. + # False means we don't fill with -inf. + self.allowed_token_ids_mask = torch.zeros( + self.max_num_reqs, + self.vocab_size, + dtype=torch.bool, + device=self.device) + self.allowed_token_ids_mask_cpu_tensor = torch.zeros( + self.max_num_reqs, + self.vocab_size, + dtype=torch.bool, + device="cpu") + self.allowed_token_ids_mask_cpu_tensor[req_index] = True # False means we don't fill with -inf. - self.allowed_token_ids_mask = torch.zeros(self.max_num_reqs, - self.vocab_size, - dtype=torch.bool, - device=self.device) - self.allowed_token_ids_mask_cpu_tensor = torch.zeros( - self.max_num_reqs, - self.vocab_size, - dtype=torch.bool, - device="cpu") - self.allowed_token_ids_mask_cpu_tensor[req_index] = True - # False means we don't fill with -inf. 
- self.allowed_token_ids_mask_cpu_tensor[req_index][ - sampling_params.allowed_token_ids] = False + self.allowed_token_ids_mask_cpu_tensor[req_index][ + sampling_params.allowed_token_ids] = False - if sampling_params.bad_words_token_ids: - self.bad_words_token_ids[ - req_index] = sampling_params.bad_words_token_ids + if sampling_params.bad_words_token_ids: + self.bad_words_token_ids[ + req_index] = sampling_params.bad_words_token_ids + else: + self.pooling_reqs.add(req_id) # Add request lora ID if request.lora_request: @@ -389,6 +399,7 @@ def remove_request(self, req_id: str) -> Optional[int]: # False means we don't fill with -inf. self.allowed_token_ids_mask_cpu_tensor[req_index].fill_(False) self.bad_words_token_ids.pop(req_index, None) + self.pooling_reqs.discard(req_id) return req_index def swap_states(self, i1: int, i2: int) -> None: @@ -594,6 +605,14 @@ def _make_sampling_metadata(self) -> SamplingMetadata: bad_words_token_ids=self.bad_words_token_ids, ) + @property + def pooling_metadata(self) -> PoolingMetadata: + return PoolingMetadata( + prompt_lens=torch.from_numpy( + self.num_prompt_tokens[:self.num_reqs]), + prompt_token_ids=self.sampling_metadata.prompt_token_ids, + ) + def _make_prompt_token_ids_tensor(self) -> torch.Tensor: max_prompt_len = self.num_prompt_tokens[:self.num_reqs].max() prompt_token_ids_cpu_tensor = torch.empty( diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index f050732ec5b0..a7ad1c7a0166 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -325,7 +325,9 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None: for new_req_data in scheduler_output.scheduled_new_reqs: req_id = new_req_data.req_id sampling_params = new_req_data.sampling_params - if sampling_params.sampling_type == SamplingType.RANDOM_SEED: + pooling_params = new_req_data.pooling_params + if sampling_params and \ + sampling_params.sampling_type == SamplingType.RANDOM_SEED: generator = torch.Generator(device=self.device) generator.manual_seed(sampling_params.seed) else: @@ -338,6 +340,7 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None: mm_inputs=new_req_data.mm_inputs, mm_positions=new_req_data.mm_positions, sampling_params=sampling_params, + pooling_params=pooling_params, generator=generator, block_ids=new_req_data.block_ids, num_computed_tokens=new_req_data.num_computed_tokens, @@ -1057,6 +1060,27 @@ def execute_model( hidden_states = hidden_states[:num_scheduled_tokens] sample_hidden_states = hidden_states[logits_indices] + + if self.input_batch.pooling_reqs: + assert self.input_batch.num_reqs ==\ + len(self.input_batch.pooling_reqs), \ + "Either all or none of the requests in" \ + " a batch must be pooling request" + + pooler_output = self.model.pooler( + hidden_states=hidden_states, + pooling_metadata=self.input_batch.pooling_metadata) + + return ModelRunnerOutput( + req_ids=self.input_batch.req_ids, + req_id_to_index=self.input_batch.req_id_to_index, + sampled_token_ids=[], + spec_token_ids=[], + logprobs=None, + prompt_logprobs_dict={}, + pooler_output=pooler_output, + ) + logits = self.model.compute_logits(sample_hidden_states, None) # Apply structured output bitmasks if present @@ -1219,7 +1243,7 @@ def execute_model( spec_token_ids=spec_token_ids, logprobs=logprobs_lists, prompt_logprobs_dict=prompt_logprobs_dict, - ) + pooler_output=[]) def generate_draft_token_ids( self, diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index 
b1d5c0f33854..26068dc919b5 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -264,6 +264,8 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> bool: req_ids_to_add: list[str] = [] # Add new requests to the cached states. for new_req_data in scheduler_output.scheduled_new_reqs: + assert new_req_data.sampling_params is not None,\ + "Pooling is not supported in TPU yet" req_id = new_req_data.req_id sampling_params = new_req_data.sampling_params if sampling_params.sampling_type == SamplingType.RANDOM_SEED: @@ -279,6 +281,7 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> bool: mm_inputs=new_req_data.mm_inputs, mm_positions=new_req_data.mm_positions, sampling_params=sampling_params, + pooling_params=None, generator=generator, block_ids=new_req_data.block_ids, num_computed_tokens=new_req_data.num_computed_tokens, @@ -726,6 +729,7 @@ def execute_model( req_ids=req_ids, req_id_to_index=self.input_batch.req_id_to_index, sampled_token_ids=valid_sampled_token_ids, + pooler_output=[], spec_token_ids=None, logprobs=None, prompt_logprobs_dict=prompt_logprobs_dict, From d60b22b552e762f5464fe6bd969bf59b60dee51b Mon Sep 17 00:00:00 2001 From: Max de Bayser Date: Mon, 7 Apr 2025 08:30:28 -0300 Subject: [PATCH 05/63] First end-to-end working version of Bert embeddings in V1 Signed-off-by: Max de Bayser --- vllm/model_executor/models/bert.py | 1 - vllm/v1/core/sched/scheduler.py | 2 ++ vllm/v1/engine/async_llm.py | 12 ++++--- vllm/v1/engine/core_client.py | 3 +- vllm/v1/engine/output_processor.py | 53 ++++++++++++++++++------------ vllm/v1/metrics/loggers.py | 5 +-- vllm/v1/worker/gpu_input_batch.py | 2 +- vllm/v1/worker/gpu_model_runner.py | 11 +++++-- 8 files changed, 55 insertions(+), 34 deletions(-) diff --git a/vllm/model_executor/models/bert.py b/vllm/model_executor/models/bert.py index a096a53dd926..75b85927e5c1 100644 --- a/vllm/model_executor/models/bert.py +++ b/vllm/model_executor/models/bert.py @@ -452,7 +452,6 @@ def pooler( hidden_states: torch.Tensor, pooling_metadata: PoolingMetadata, ) -> Optional[PoolerOutput]: - print(f"{hidden_states[-1]=}") return self._pooler(hidden_states, pooling_metadata) def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index 58a020ab42e1..d69f6913c058 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -555,6 +555,7 @@ def update_from_output( prompt_logprobs_dict = model_runner_output.prompt_logprobs_dict spec_decoding_stats = SpecDecodingStats() if self.log_stats else None num_scheduled_tokens = scheduler_output.num_scheduled_tokens + pooler_outputs = model_runner_output.pooler_output new_running: list[Request] = [] outputs: list[EngineCoreOutput] = [] @@ -652,6 +653,7 @@ def update_from_output( finish_reason=request.get_finished_reason(), new_logprobs=new_logprobs, new_prompt_logprobs_tensors=prompt_logprobs_tensors, + pooling_output=pooler_outputs[req_index], stop_reason=request.stop_reason, events=request.take_events())) else: diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 62163ebb410e..283365062753 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -22,7 +22,7 @@ from vllm.outputs import PoolingRequestOutput, RequestOutput from vllm.pooling_params import PoolingParams from vllm.prompt_adapter.request import PromptAdapterRequest -from vllm.sampling_params import SamplingParams +from 
vllm.sampling_params import RequestOutputKind, SamplingParams from vllm.transformers_utils.tokenizer import AnyTokenizer from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs from vllm.usage.usage_lib import UsageContext @@ -187,11 +187,12 @@ async def add_request( ) -> RequestOutputCollector: """Add new request to the AsyncLLM.""" - assert isinstance(params, SamplingParams), \ - "Pooling is not supported in V1" + is_pooling = isinstance(params, PoolingParams) # Create a new output collector for the request. - queue = RequestOutputCollector(output_kind=params.output_kind) + queue = RequestOutputCollector( + output_kind=params.output_kind \ + if not is_pooling else RequestOutputKind.FINAL_ONLY) # Convert Input --> Request. request = self.processor.process_inputs(request_id, prompt, params, @@ -200,7 +201,7 @@ async def add_request( prompt_adapter_request, priority) - if params.n == 1: + if is_pooling or params.n == 1: await self._add_request(request, None, 0, queue) return queue @@ -300,6 +301,7 @@ async def _run_output_handler(self): try: while True: + # 1) Pull EngineCoreOutputs from the EngineCore. outputs = await self.engine_core.get_output_async() num_outputs = len(outputs.outputs) diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index e948e59b8c42..16d87b4c9f96 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -623,7 +623,8 @@ async def process_outputs_socket(): async def get_output_async(self) -> EngineCoreOutputs: self._ensure_output_queue_task() assert self.outputs_queue is not None - return await self.outputs_queue.get() + result = await self.outputs_queue.get() + return result async def _send_input(self, request_type: EngineCoreRequestType, request: Any) -> None: diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py index 1f73ddcaedde..e3615d21c7b6 100644 --- a/vllm/v1/engine/output_processor.py +++ b/vllm/v1/engine/output_processor.py @@ -9,7 +9,7 @@ from vllm.outputs import (CompletionOutput, PoolingOutput, PoolingRequestOutput, RequestOutput) -from vllm.sampling_params import RequestOutputKind, SamplingParams +from vllm.sampling_params import RequestOutputKind from vllm.transformers_utils.tokenizer import AnyTokenizer from vllm.transformers_utils.tokenizer_group import BaseTokenizerGroup from vllm.v1.engine import EngineCoreOutput, EngineCoreRequest, FinishReason @@ -80,8 +80,8 @@ def __init__( output_kind: RequestOutputKind, prompt: Optional[str], prompt_token_ids: list[int], - logprobs_processor: LogprobsProcessor, - detokenizer: IncrementalDetokenizer, + logprobs_processor: Optional[LogprobsProcessor], + detokenizer: Optional[IncrementalDetokenizer], max_tokens_param: Optional[int], arrival_time: float, queue: Optional[RequestOutputCollector], @@ -115,11 +115,24 @@ def from_new_request( log_stats: bool, ) -> "RequestState": - sampling_params = request.sampling_params \ - if request.sampling_params \ - else SamplingParams.from_optional() - if not sampling_params.detokenize: - tokenizer = None + if sampling_params := request.sampling_params: + if not sampling_params.detokenize: + tokenizer = None + output_kind = sampling_params.output_kind + logprobs_processor = LogprobsProcessor.from_new_request( + tokenizer=tokenizer, + request=request, + ) + detokenizer = IncrementalDetokenizer.from_new_request( + tokenizer=tokenizer, + request=request, + ) + max_tokens_param = sampling_params.max_tokens + else: + logprobs_processor = None + detokenizer = None + 
max_tokens_param = None + output_kind = RequestOutputKind.FINAL_ONLY return cls( request_id=request.request_id, @@ -127,19 +140,12 @@ def from_new_request( request_index=request_index, lora_name=(request.lora_request.name if request.lora_request is not None else None), - output_kind=sampling_params.output_kind, + output_kind=output_kind, prompt=request.prompt, prompt_token_ids=request.prompt_token_ids, - logprobs_processor=LogprobsProcessor.from_new_request( - tokenizer=tokenizer, - request=request, - ), - detokenizer=IncrementalDetokenizer.from_new_request( - tokenizer=tokenizer, - request=request, - ), - max_tokens_param=(sampling_params.max_tokens - if sampling_params is not None else None), + logprobs_processor=logprobs_processor, + detokenizer=detokenizer, + max_tokens_param=max_tokens_param, arrival_time=request.arrival_time, queue=queue, log_stats=log_stats, @@ -161,7 +167,7 @@ def make_request_output( return None request_id = self.request_id - if pooling_output: + if pooling_output is not None: return self._new_request_output( request_id, [self._new_pooling_output(pooling_output)], finished) @@ -195,6 +201,7 @@ def _new_request_output( prompt_token_ids=self.prompt_token_ids, finished=finished, ) + assert self.logprobs_processor is not None if self.output_kind == RequestOutputKind.DELTA: # Side effect: logprobs processor forgets prompt logprobs prompt_logprobs = self.logprobs_processor.pop_prompt_logprobs() @@ -217,6 +224,8 @@ def _new_completion_output( stop_reason: Union[int, str, None], ) -> CompletionOutput: + assert self.detokenizer is not None + assert self.logprobs_processor is not None finished = finish_reason is not None delta = self.output_kind == RequestOutputKind.DELTA @@ -358,7 +367,9 @@ def process_outputs( req_state.is_prefilling = False - if not pooling_output: + if pooling_output is None: + assert req_state.detokenizer is not None + assert req_state.logprobs_processor is not None # 2) Detokenize the token ids into text and perform stop checks. 
stop_string = req_state.detokenizer.update( new_token_ids, finish_reason == FinishReason.STOP) diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py index 73883d9a735d..0e9624d95794 100644 --- a/vllm/v1/metrics/loggers.py +++ b/vllm/v1/metrics/loggers.py @@ -408,8 +408,9 @@ def record(self, scheduler_stats: SchedulerStats, finished_request.num_prompt_tokens) self.histogram_num_generation_tokens_request.observe( finished_request.num_generation_tokens) - self.histogram_max_tokens_request.observe( - finished_request.max_tokens_param) + if finished_request.max_tokens_param: + self.histogram_max_tokens_request.observe( + finished_request.max_tokens_param) if self.gauge_lora_info is not None: running_lora_adapters = \ diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py index b53229752525..086b22d80512 100644 --- a/vllm/v1/worker/gpu_input_batch.py +++ b/vllm/v1/worker/gpu_input_batch.py @@ -609,7 +609,7 @@ def _make_sampling_metadata(self) -> SamplingMetadata: def pooling_metadata(self) -> PoolingMetadata: return PoolingMetadata( prompt_lens=torch.from_numpy( - self.num_prompt_tokens[:self.num_reqs]), + self.num_prompt_tokens[:self.num_reqs]).to(self.device), prompt_token_ids=self.sampling_metadata.prompt_token_ids, ) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index a7ad1c7a0166..3a3c1fea7945 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1071,14 +1071,19 @@ def execute_model( hidden_states=hidden_states, pooling_metadata=self.input_batch.pooling_metadata) + # any token will do because max tokens is 1 + sampled_tokens = [[0]] * self.input_batch.num_reqs + return ModelRunnerOutput( req_ids=self.input_batch.req_ids, req_id_to_index=self.input_batch.req_id_to_index, - sampled_token_ids=[], - spec_token_ids=[], + sampled_token_ids=sampled_tokens, + spec_token_ids=None, logprobs=None, prompt_logprobs_dict={}, - pooler_output=pooler_output, + pooler_output=[ + o.data.to("cpu") for o in pooler_output.outputs + ], ) logits = self.model.compute_logits(sample_hidden_states, None) From 6bebbb8b46a664cbeda5f8dfc5e9ce9f1d4bf07a Mon Sep 17 00:00:00 2001 From: Max de Bayser Date: Mon, 7 Apr 2025 09:45:43 -0300 Subject: [PATCH 06/63] Support warmup for pooling models in V1 ... and disable cuda graphs for these models. 
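The warmup path now exercises the pooler instead of the sampler, so an out-of-memory condition shows up at startup rather than on the first embedding request. Roughly, the dummy run boils down to the sketch below; the helper name and the sizes are made up for illustration, and the real code is in gpu_model_runner.py further down in this patch.

# Illustrative sketch only: `model`, `hidden_size` and the batch sizes are
# placeholders, not the actual GPUModelRunner attributes.
import torch

from vllm.v1.pool.metadata import PoolingMetadata


def dummy_pooler_warmup(model, hidden_size: int, num_reqs: int = 2,
                        tokens_per_req: int = 8, device: str = "cuda"):
    # Fake flattened hidden states for num_reqs requests of equal length.
    hidden_states = torch.zeros(num_reqs * tokens_per_req, hidden_size,
                                device=device)
    dummy_metadata = PoolingMetadata(
        prompt_lens=torch.tensor([tokens_per_req] * num_reqs, device=device),
        prompt_token_ids=torch.zeros((num_reqs, tokens_per_req),
                                     dtype=torch.int32,
                                     device=device),
    )
    # Running the pooler once over the dummy batch is the whole warmup.
    return model.pooler(hidden_states=hidden_states,
                        pooling_metadata=dummy_metadata)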
Signed-off-by: Max de Bayser --- vllm/config.py | 3 ++ vllm/model_executor/models/bert.py | 24 +------------ vllm/v1/worker/gpu_model_runner.py | 54 +++++++++++++++++++++++++++--- vllm/v1/worker/gpu_worker.py | 12 +++++-- 4 files changed, 62 insertions(+), 31 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index 92e887e08639..a3f035bfc459 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -492,6 +492,9 @@ def _init_pooler_config( ) -> Optional["PoolerConfig"]: if self.runner_type == "pooling": + logger.warning("CUDA graph is not supported for pooling yet, " + "fallback to the eager mode.") + self.enforce_eager = True user_config = override_pooler_config or PoolerConfig() base_config = get_pooling_config(self.model, self.revision) diff --git a/vllm/model_executor/models/bert.py b/vllm/model_executor/models/bert.py index 75b85927e5c1..2698b95c4979 100644 --- a/vllm/model_executor/models/bert.py +++ b/vllm/model_executor/models/bert.py @@ -18,12 +18,11 @@ from vllm.model_executor.layers.pooler import (CrossEncodingPooler, Pooler, PoolingType) from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler +from vllm.model_executor.layers.sampler import get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.pooling_metadata import PoolingMetadata -from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors, PoolerOutput from vllm.transformers_utils.config import ( get_cross_encoder_activation_function) @@ -426,27 +425,6 @@ def forward( intermediate_tensors=intermediate_tensors) return hidden_states - # TODO: Remove test scaffolding after pooling is implemented - def compute_logits( - self, - hidden_states: torch.Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[torch.Tensor]: - print(f"{hidden_states=}") - logits = torch.zeros( - (hidden_states.shape[0], self.model.config.vocab_size), - dtype=torch.half, - device=hidden_states.device) - logits[:, 333] = 1.0 - - return logits - - # TODO: Remove test scaffolding after pooling is implemented - def sample(self, logits: torch.Tensor, - sampling_metadata: SamplingMetadata) -> Optional[SamplerOutput]: - next_tokens = self.sampler(logits, sampling_metadata) - return next_tokens - def pooler( self, hidden_states: torch.Tensor, diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 3a3c1fea7945..c76c424a9d55 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -33,6 +33,7 @@ SlidingWindowSpec) from vllm.v1.outputs import (EMPTY_MODEL_RUNNER_OUTPUT, LogprobsTensors, ModelRunnerOutput) +from vllm.v1.pool.metadata import PoolingMetadata from vllm.v1.sample.metadata import SamplingMetadata from vllm.v1.sample.rejection_sampler import RejectionSampler from vllm.v1.spec_decode.eagle import EagleProposer @@ -99,6 +100,7 @@ def __init__( or self.interleaved_sliding_window) self.is_multimodal_model = model_config.is_multimodal_model + self.is_pooling_model = model_config.pooler_config is not None self.block_size = cache_config.block_size self.max_model_len = model_config.max_model_len self.max_num_blocks_per_req = cdiv(self.max_model_len, self.block_size) @@ -1460,7 +1462,7 @@ def _dummy_run( ) logit_indices = np.cumsum(num_scheduled_tokens) - 1 - return hidden_states[logit_indices] 
+ return hidden_states, hidden_states[logit_indices], num_reqs @torch.inference_mode() def _dummy_sampler_run( @@ -1535,6 +1537,43 @@ def _dummy_sampler_run( ) return sampler_output + @torch.inference_mode() + def _dummy_pooler_run( + self, + num_tokens: int, + num_reqs: int, + hidden_states: torch.Tensor, + ) -> torch.Tensor: + + req_num_tokens = num_tokens // num_reqs + + dummy_metadata = PoolingMetadata( + prompt_lens=torch.tensor([req_num_tokens] * num_reqs, + device=self.device), + prompt_token_ids=torch.zeros((num_reqs, req_num_tokens), + dtype=torch.int32, + device=self.device)) + print(f"{num_tokens=}") + print(f"{num_reqs=}") + print(f"{req_num_tokens=}") + print(f"{hidden_states.shape=}") + print(f"{dummy_metadata.prompt_lens=}") + print(f"{dummy_metadata.prompt_token_ids=}") + + try: + pooler_output = self.model.pooler(hidden_states=hidden_states, + pooling_metadata=dummy_metadata) + except RuntimeError as e: + if 'out of memory' in str(e): + raise RuntimeError( + "CUDA out of memory occurred when warming up pooler with " + f"{num_reqs} dummy requests. Please try lowering " + "`max_num_seqs` or `gpu_memory_utilization` when " + "initializing the engine.") from e + else: + raise e + return pooler_output + def profile_run(self) -> None: # Profile with multimodal encoder & encoder cache. # TODO: handle encoder-decoder models once we support them. @@ -1602,13 +1641,18 @@ def profile_run(self) -> None: # Cache the dummy encoder outputs. self.encoder_cache["tmp"] = dict(enumerate(dummy_encoder_outputs)) - hidden_states = self._dummy_run(self.max_num_tokens) + hidden_states, last_hidden_states, num_reqs = self._dummy_run( + self.max_num_tokens) if get_pp_group().is_last_rank: - sampler_output = self._dummy_sampler_run(hidden_states) + if self.is_pooling_model: + output = self._dummy_pooler_run(self.max_num_tokens, num_reqs, + hidden_states) + else: + output = self._dummy_sampler_run(last_hidden_states) else: - sampler_output = None + output = None torch.cuda.synchronize() - del hidden_states, sampler_output + del hidden_states, output self.encoder_cache.clear() gc.collect() diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index 2972e0ffb3ba..f222034ff8f7 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -223,9 +223,15 @@ def compile_or_warm_up_model(self) -> None: if get_pp_group().is_last_rank: max_num_reqs = min(self.scheduler_config.max_num_seqs, self.scheduler_config.max_num_batched_tokens) - self.model_runner._dummy_sampler_run( - hidden_states=self.model_runner._dummy_run( - num_tokens=max_num_reqs)) + + hidden_states, last_hidden_states, num_reqs = \ + self.model_runner._dummy_run(num_tokens=max_num_reqs) + if self.model_runner.is_pooling_model: + self.model_runner._dummy_pooler_run(max_num_reqs, num_reqs, + hidden_states) + else: + self.model_runner._dummy_sampler_run( + hidden_states=last_hidden_states) # Reset the seed to ensure that the random state is not affected by # the model initialization and profiling. 
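At this point in the series a BERT-style embedding model runs end to end on the V1 engine with eager execution. A minimal smoke test of the behaviour so far could look like the sketch below; it assumes the existing LLM.embed() entrypoint and the multilingual-e5-small model used by the OpenAI-compatible tests later in the series, neither of which is introduced by this diff.

# Usage sketch, not a test from this patch series.
import os

os.environ["VLLM_USE_V1"] = "1"  # opt in to the V1 engine

from vllm import LLM

llm = LLM(model="intfloat/multilingual-e5-small",
          task="embed",
          enforce_eager=True)

outputs = llm.embed(["vLLM now pools hidden states in V1."])
# llm.embed() returns one output per prompt; .outputs.embedding holds the
# pooled vector for that prompt.
print(len(outputs[0].outputs.embedding))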
From 6dafd71c5c574f9e2c5633ca9a07e797a8990fee Mon Sep 17 00:00:00 2001 From: Max de Bayser Date: Mon, 7 Apr 2025 11:57:26 -0300 Subject: [PATCH 07/63] address review comments Signed-off-by: Max de Bayser --- vllm/model_executor/layers/pooler.py | 2 +- vllm/model_executor/models/bert.py | 2 +- vllm/pooling_params.py | 7 +++++++ vllm/v1/engine/async_llm.py | 6 ++---- vllm/v1/engine/core.py | 1 - vllm/v1/engine/llm_engine.py | 2 +- vllm/v1/engine/output_processor.py | 3 ++- 7 files changed, 14 insertions(+), 9 deletions(-) diff --git a/vllm/model_executor/layers/pooler.py b/vllm/model_executor/layers/pooler.py index 8e9155c17f9d..147d34aac219 100644 --- a/vllm/model_executor/layers/pooler.py +++ b/vllm/model_executor/layers/pooler.py @@ -197,7 +197,7 @@ def get_prompt_token_ids( for i, num in enumerate(pooling_metadata.prompt_lens) ] return [ - seq_data_i.prompt_token_ids + torch.tensor(seq_data_i.prompt_token_ids) for seq_data_i in pooling_metadata.seq_data.values() ] diff --git a/vllm/model_executor/models/bert.py b/vllm/model_executor/models/bert.py index 2698b95c4979..7ee5681f5833 100644 --- a/vllm/model_executor/models/bert.py +++ b/vllm/model_executor/models/bert.py @@ -116,7 +116,7 @@ def forward( self, hidden_states: torch.Tensor, ) -> torch.Tensor: - for i, layer in enumerate(self.layer): + for layer in self.layer: hidden_states = layer(hidden_states) return hidden_states diff --git a/vllm/pooling_params.py b/vllm/pooling_params.py index 061232eb1183..bd7e61e268dd 100644 --- a/vllm/pooling_params.py +++ b/vllm/pooling_params.py @@ -4,6 +4,8 @@ import msgspec +from vllm.sampling_params import RequestOutputKind + class PoolingParams( msgspec.Struct, @@ -15,6 +17,7 @@ class PoolingParams( additional_data: Any additional data needed for pooling. """ additional_data: Optional[Any] = None + output_kind: RequestOutputKind = RequestOutputKind.FINAL_ONLY def clone(self) -> "PoolingParams": """Returns a deep copy of the PoolingParams instance.""" @@ -23,3 +26,7 @@ def clone(self) -> "PoolingParams": def __repr__(self) -> str: return (f"PoolingParams(" f"additional_metadata={self.additional_data})") + + def __post_init__(self) -> None: + assert self.output_kind == RequestOutputKind.FINAL_ONLY,\ + "For pooling output_kind has to be FINAL_ONLY" diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 283365062753..4a90c2ca701c 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -22,7 +22,7 @@ from vllm.outputs import PoolingRequestOutput, RequestOutput from vllm.pooling_params import PoolingParams from vllm.prompt_adapter.request import PromptAdapterRequest -from vllm.sampling_params import RequestOutputKind, SamplingParams +from vllm.sampling_params import SamplingParams from vllm.transformers_utils.tokenizer import AnyTokenizer from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs from vllm.usage.usage_lib import UsageContext @@ -190,9 +190,7 @@ async def add_request( is_pooling = isinstance(params, PoolingParams) # Create a new output collector for the request. - queue = RequestOutputCollector( - output_kind=params.output_kind \ - if not is_pooling else RequestOutputKind.FINAL_ONLY) + queue = RequestOutputCollector(output_kind=params.output_kind) # Convert Input --> Request. 
request = self.processor.process_inputs(request_id, prompt, params, diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index aab0436030d7..f2e02c900849 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -150,7 +150,6 @@ def _initialize_kv_caches( ]) num_gpu_blocks = kv_cache_configs[0].num_blocks else: - kv_cache_configs = [] kv_cache_configs = [ KVCacheConfig(num_blocks=1, tensors={}, kv_cache_groups=[]) for kv_cache_spec_one_worker in kv_cache_specs diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index d905e57aca35..f9af30b415be 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -211,7 +211,7 @@ def add_request( # Add the request to EngineCore. self.engine_core.add_request(child_request) - def step(self) -> list[RequestOutput | PoolingRequestOutput]: + def step(self) -> list[Union[RequestOutput, PoolingRequestOutput]]: if self.should_execute_dummy_batch: self.should_execute_dummy_batch = False diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py index e3615d21c7b6..9e283f1a03f1 100644 --- a/vllm/v1/engine/output_processor.py +++ b/vllm/v1/engine/output_processor.py @@ -132,7 +132,8 @@ def from_new_request( logprobs_processor = None detokenizer = None max_tokens_param = None - output_kind = RequestOutputKind.FINAL_ONLY + assert request.pooling_params is not None + output_kind = request.pooling_params.output_kind return cls( request_id=request.request_id, From e2724a2feb5af5f417005e493bf6716443f5d260 Mon Sep 17 00:00:00 2001 From: Max de Bayser Date: Mon, 7 Apr 2025 12:02:00 -0300 Subject: [PATCH 08/63] address review comments Signed-off-by: Max de Bayser --- vllm/outputs.py | 9 ++------- vllm/v1/engine/output_processor.py | 1 - 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/vllm/outputs.py b/vllm/outputs.py index 44266591187d..014e8d5d8823 100644 --- a/vllm/outputs.py +++ b/vllm/outputs.py @@ -344,11 +344,10 @@ class PoolingRequestOutput(Generic[_O]): finished (bool): A flag indicating whether the pooling is completed. 
""" - def __init__(self, request_id: str, outputs: _O, prompt: Optional[str], + def __init__(self, request_id: str, outputs: _O, prompt_token_ids: list[int], finished: bool): self.request_id = request_id self.prompt_token_ids = prompt_token_ids - self.prompt = prompt self.finished = finished self.outputs = outputs @@ -360,10 +359,9 @@ def from_seq_group(seq_group: SequenceGroup) -> "PoolingRequestOutput": data = pooled_data.to(dtype=torch.float32, device="cpu") output = PoolingOutput(data) prompt_token_ids = seq_group.prompt_token_ids - prompt = seq_group.prompt finished = seq_group.is_finished() - return PoolingRequestOutput(seq_group.request_id, output, prompt, + return PoolingRequestOutput(seq_group.request_id, output, prompt_token_ids, finished) def __repr__(self): @@ -428,7 +426,6 @@ def from_base(request_output: PoolingRequestOutput): return EmbeddingRequestOutput( request_id=request_output.request_id, outputs=EmbeddingOutput.from_base(request_output.outputs), - prompt=request_output.prompt, prompt_token_ids=request_output.prompt_token_ids, finished=request_output.finished, ) @@ -467,7 +464,6 @@ def from_base(request_output: PoolingRequestOutput): return ClassificationRequestOutput( request_id=request_output.request_id, outputs=ClassificationOutput.from_base(request_output.outputs), - prompt=request_output.prompt, prompt_token_ids=request_output.prompt_token_ids, finished=request_output.finished, ) @@ -507,7 +503,6 @@ def from_base(request_output: PoolingRequestOutput): return ScoringRequestOutput( request_id=request_output.request_id, outputs=ScoringOutput.from_base(request_output.outputs), - prompt=request_output.prompt, prompt_token_ids=request_output.prompt_token_ids, finished=request_output.finished, ) diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py index 9e283f1a03f1..f763ac3cd50e 100644 --- a/vllm/v1/engine/output_processor.py +++ b/vllm/v1/engine/output_processor.py @@ -198,7 +198,6 @@ def _new_request_output( return PoolingRequestOutput( request_id=request_id, outputs=outputs[0], - prompt=self.prompt, prompt_token_ids=self.prompt_token_ids, finished=finished, ) From 56ff6cdad32a2aec216790d900f7c5547b605f9d Mon Sep 17 00:00:00 2001 From: Max de Bayser Date: Mon, 7 Apr 2025 12:11:27 -0300 Subject: [PATCH 09/63] remove debug prints Signed-off-by: Max de Bayser --- vllm/v1/worker/gpu_model_runner.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index c76c424a9d55..6a875d8c3f6e 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1553,12 +1553,6 @@ def _dummy_pooler_run( prompt_token_ids=torch.zeros((num_reqs, req_num_tokens), dtype=torch.int32, device=self.device)) - print(f"{num_tokens=}") - print(f"{num_reqs=}") - print(f"{req_num_tokens=}") - print(f"{hidden_states.shape=}") - print(f"{dummy_metadata.prompt_lens=}") - print(f"{dummy_metadata.prompt_token_ids=}") try: pooler_output = self.model.pooler(hidden_states=hidden_states, From fc57edd026c183d4f0b13528b4b626f9796739eb Mon Sep 17 00:00:00 2001 From: Max de Bayser Date: Mon, 7 Apr 2025 14:28:27 -0300 Subject: [PATCH 10/63] address review comments Signed-off-by: Max de Bayser --- vllm/v1/engine/core_client.py | 3 +-- vllm/v1/engine/output_processor.py | 6 +++--- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 16d87b4c9f96..e948e59b8c42 100644 --- a/vllm/v1/engine/core_client.py +++ 
b/vllm/v1/engine/core_client.py @@ -623,8 +623,7 @@ async def process_outputs_socket(): async def get_output_async(self) -> EngineCoreOutputs: self._ensure_output_queue_task() assert self.outputs_queue is not None - result = await self.outputs_queue.get() - return result + return await self.outputs_queue.get() async def _send_input(self, request_type: EngineCoreRequestType, request: Any) -> None: diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py index f763ac3cd50e..330045f08023 100644 --- a/vllm/v1/engine/output_processor.py +++ b/vllm/v1/engine/output_processor.py @@ -158,7 +158,7 @@ def make_request_output( pooling_output: Optional[torch.Tensor], finish_reason: Optional[FinishReason], stop_reason: Union[int, str, None], - ) -> Optional[Union[RequestOutput | PoolingRequestOutput]]: + ) -> Optional[Union[RequestOutput, PoolingRequestOutput]]: finished = finish_reason is not None final_only = self.output_kind == RequestOutputKind.FINAL_ONLY @@ -191,7 +191,7 @@ def _new_request_output( request_id: str, outputs: Union[list[CompletionOutput], list[PoolingOutput]], finished: bool, - ) -> RequestOutput | PoolingRequestOutput: + ) -> Union[RequestOutput, PoolingRequestOutput]: if isinstance(outputs[0], PoolingOutput): assert len(outputs) == 1 @@ -346,7 +346,7 @@ def process_outputs( ********************************************************** """ - request_outputs: list[RequestOutput | PoolingRequestOutput] = [] + request_outputs: list[Union[RequestOutput, PoolingRequestOutput]] = [] reqs_to_abort: list[str] = [] for engine_core_output in engine_core_outputs: req_id = engine_core_output.request_id From 64a0e62b6e70c7a9a95e3c30baaf632685d795fc Mon Sep 17 00:00:00 2001 From: Max de Bayser Date: Mon, 7 Apr 2025 23:44:26 -0300 Subject: [PATCH 11/63] Fix cross encoder models in V1 and enable tests for pooling models Signed-off-by: Max de Bayser --- tests/entrypoints/openai/test_embedding.py | 32 +++++++++++++------ tests/entrypoints/openai/test_rerank.py | 8 +++++ tests/entrypoints/openai/test_score.py | 9 ++++++ .../embedding/language/test_cls_models.py | 8 +++++ .../embedding/language/test_embedding.py | 20 ++++++++---- .../language/test_jina_reranker_v2.py | 8 +++++ .../models/embedding/language/test_scoring.py | 8 +++++ vllm/entrypoints/llm.py | 2 +- vllm/model_executor/layers/pooler.py | 1 - vllm/model_executor/models/roberta.py | 5 ++- vllm/v1/core/sched/output.py | 2 ++ vllm/v1/engine/__init__.py | 1 + vllm/v1/engine/processor.py | 1 + vllm/v1/request.py | 3 ++ vllm/v1/serial_utils.py | 2 +- vllm/v1/worker/gpu_input_batch.py | 29 +++++++++++++++++ vllm/v1/worker/gpu_model_runner.py | 28 +++++++++++++++- vllm/v1/worker/tpu_model_runner.py | 1 + 18 files changed, 146 insertions(+), 22 deletions(-) diff --git a/tests/entrypoints/openai/test_embedding.py b/tests/entrypoints/openai/test_embedding.py index 0d1c936da759..bfc4efbb019d 100644 --- a/tests/entrypoints/openai/test_embedding.py +++ b/tests/entrypoints/openai/test_embedding.py @@ -11,12 +11,21 @@ from vllm.entrypoints.openai.protocol import EmbeddingResponse from vllm.transformers_utils.tokenizer import get_tokenizer +from ...models.embedding.utils import check_embeddings_close from ...utils import RemoteOpenAIServer MODEL_NAME = "intfloat/multilingual-e5-small" DUMMY_CHAT_TEMPLATE = """{% for message in messages %}{{message['role'] + ': ' + message['content'] + '\\n'}}{% endfor %}""" # noqa: E501 +@pytest.fixture(autouse=True) +def v1(run_with_both_engines): + # Simple autouse wrapper to run both engines for 
each test + # This can be promoted up to conftest.py to run for every + # test in a package + pass + + @pytest.fixture(scope="module") def server(): args = [ @@ -201,19 +210,24 @@ async def test_batch_base64_embedding(client: openai.AsyncOpenAI, np.frombuffer(base64.b64decode(data.embedding), dtype="float32").tolist()) - assert responses_float.data[0].embedding == decoded_responses_base64_data[ - 0] - assert responses_float.data[1].embedding == decoded_responses_base64_data[ - 1] + check_embeddings_close( + embeddings_0_lst=[d.embedding for d in responses_float.data], + embeddings_1_lst=decoded_responses_base64_data, + name_0="float", + name_1="base64", + tol=1e-2, + ) # Default response is float32 decoded from base64 by OpenAI Client responses_default = await client.embeddings.create(input=input_texts, model=model_name) - - assert responses_float.data[0].embedding == responses_default.data[ - 0].embedding - assert responses_float.data[1].embedding == responses_default.data[ - 1].embedding + check_embeddings_close( + embeddings_0_lst=[d.embedding for d in responses_float.data], + embeddings_1_lst=[d.embedding for d in responses_default.data], + name_0="float", + name_1="default", + tol=1e-2, + ) @pytest.mark.asyncio diff --git a/tests/entrypoints/openai/test_rerank.py b/tests/entrypoints/openai/test_rerank.py index ba11cd3a29a8..b386e37512bc 100644 --- a/tests/entrypoints/openai/test_rerank.py +++ b/tests/entrypoints/openai/test_rerank.py @@ -11,6 +11,14 @@ DTYPE = "bfloat16" +@pytest.fixture(autouse=True) +def v1(run_with_both_engines): + # Simple autouse wrapper to run both engines for each test + # This can be promoted up to conftest.py to run for every + # test in a package + pass + + @pytest.fixture(scope="module") def server(): args = ["--enforce-eager", "--max-model-len", "100", "--dtype", DTYPE] diff --git a/tests/entrypoints/openai/test_score.py b/tests/entrypoints/openai/test_score.py index b756680ea9f2..d880920fa869 100644 --- a/tests/entrypoints/openai/test_score.py +++ b/tests/entrypoints/openai/test_score.py @@ -12,6 +12,15 @@ from ...utils import RemoteOpenAIServer + +@pytest.fixture(autouse=True) +def v1(run_with_both_engines): + # Simple autouse wrapper to run both engines for each test + # This can be promoted up to conftest.py to run for every + # test in a package + pass + + MODELS = [ { "name": "BAAI/bge-reranker-v2-m3", diff --git a/tests/models/embedding/language/test_cls_models.py b/tests/models/embedding/language/test_cls_models.py index 6a3cd8a5c594..18059749fd8a 100644 --- a/tests/models/embedding/language/test_cls_models.py +++ b/tests/models/embedding/language/test_cls_models.py @@ -9,6 +9,14 @@ from vllm.platforms import current_platform +# TODO: enable when float32 is supported by V1 +# @pytest.fixture(autouse=True) +# def v1(run_with_both_engines): +# # Simple autouse wrapper to run both engines for each test +# # This can be promoted up to conftest.py to run for every +# # test in a package +# pass + @pytest.mark.parametrize( "model", diff --git a/tests/models/embedding/language/test_embedding.py b/tests/models/embedding/language/test_embedding.py index 5deb35fa3210..ec470681ff85 100644 --- a/tests/models/embedding/language/test_embedding.py +++ b/tests/models/embedding/language/test_embedding.py @@ -11,6 +11,14 @@ from ..utils import check_embeddings_close +@pytest.fixture(autouse=True) +def v1(run_with_both_engines): + # Simple autouse wrapper to run both engines for each test + # This can be promoted up to conftest.py to run for every + # test in a 
package + pass + + @pytest.mark.parametrize( "model", [ @@ -19,14 +27,14 @@ marks=[pytest.mark.core_model, pytest.mark.cpu_model]), pytest.param("sentence-transformers/all-MiniLM-L12-v2"), pytest.param("intfloat/multilingual-e5-small"), - pytest.param("Alibaba-NLP/gte-Qwen2-7B-instruct"), + #pytest.param("Alibaba-NLP/gte-Qwen2-7B-instruct"), # [Decoder-only] - pytest.param("BAAI/bge-multilingual-gemma2", - marks=[pytest.mark.core_model]), - pytest.param("intfloat/e5-mistral-7b-instruct", - marks=[pytest.mark.core_model, pytest.mark.cpu_model]), + #pytest.param("BAAI/bge-multilingual-gemma2", + # marks=[pytest.mark.core_model]), + #pytest.param("intfloat/e5-mistral-7b-instruct", + # marks=[pytest.mark.core_model, pytest.mark.cpu_model]), pytest.param("Alibaba-NLP/gte-Qwen2-1.5B-instruct"), - pytest.param("ssmits/Qwen2-7B-Instruct-embed-base"), + #pytest.param("ssmits/Qwen2-7B-Instruct-embed-base"), # [Cross-Encoder] pytest.param("sentence-transformers/stsb-roberta-base-v2"), ], diff --git a/tests/models/embedding/language/test_jina_reranker_v2.py b/tests/models/embedding/language/test_jina_reranker_v2.py index ab88fa9ba636..2afde161ca4f 100644 --- a/tests/models/embedding/language/test_jina_reranker_v2.py +++ b/tests/models/embedding/language/test_jina_reranker_v2.py @@ -28,6 +28,14 @@ ] +@pytest.fixture(autouse=True) +def v1(run_with_both_engines): + # Simple autouse wrapper to run both engines for each test + # This can be promoted up to conftest.py to run for every + # test in a package + pass + + @pytest.fixture(scope="module", params=MODELS) def model_name(request): yield request.param diff --git a/tests/models/embedding/language/test_scoring.py b/tests/models/embedding/language/test_scoring.py index d6408258ffce..783759d10e58 100644 --- a/tests/models/embedding/language/test_scoring.py +++ b/tests/models/embedding/language/test_scoring.py @@ -29,6 +29,14 @@ ] +@pytest.fixture(autouse=True) +def v1(run_with_both_engines): + # Simple autouse wrapper to run both engines for each test + # This can be promoted up to conftest.py to run for every + # test in a package + pass + + @pytest.fixture(scope="module", params=MODELS) def model_name(request): yield request.param diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index f39b011c9301..23b890a488ae 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -1142,7 +1142,7 @@ def score( # the tokenizer for models such as # "cross-encoder/ms-marco-MiniLM-L-6-v2" doesn't support passing # lists of tokens to the `text` and `text_pair` kwargs - tokenizer = self.llm_engine.get_tokenizer() + tokenizer = self.get_tokenizer() def ensure_str(prompt: SingletonPrompt): if isinstance(prompt, dict): diff --git a/vllm/model_executor/layers/pooler.py b/vllm/model_executor/layers/pooler.py index 147d34aac219..9183943f1989 100644 --- a/vllm/model_executor/layers/pooler.py +++ b/vllm/model_executor/layers/pooler.py @@ -323,7 +323,6 @@ def forward( pooling_metadata: PoolingMetadata, ) -> PoolerOutput: """Pools sentence pair scores from the hidden_states.""" - prompt_lens = self.get_prompt_lens(hidden_states, pooling_metadata) offset = 0 diff --git a/vllm/model_executor/models/roberta.py b/vllm/model_executor/models/roberta.py index b52032b1cdc8..63533e97e6bb 100644 --- a/vllm/model_executor/models/roberta.py +++ b/vllm/model_executor/models/roberta.py @@ -19,7 +19,7 @@ from vllm.transformers_utils.config import ( get_cross_encoder_activation_function) -from .interfaces import SupportsCrossEncoding, SupportsV0Only +from .interfaces 
import SupportsCrossEncoding def roberta_task_weights_filter( @@ -193,8 +193,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): assert len(loaded), "Unable to load RobertaEmbeddingModel" -class RobertaForSequenceClassification(nn.Module, SupportsCrossEncoding, - SupportsV0Only): +class RobertaForSequenceClassification(nn.Module, SupportsCrossEncoding): """A model that uses Roberta to provide embedding functionalities. This class encapsulates the BertModel and provides an interface for diff --git a/vllm/v1/core/sched/output.py b/vllm/v1/core/sched/output.py index b8895be0dbc4..928a46058672 100644 --- a/vllm/v1/core/sched/output.py +++ b/vllm/v1/core/sched/output.py @@ -21,6 +21,7 @@ class NewRequestData: req_id: str prompt_token_ids: list[int] + token_type_ids: Optional[list[int]] prompt: Optional[str] mm_inputs: list[MultiModalKwargs] mm_hashes: list[str] @@ -40,6 +41,7 @@ def from_request( return cls( req_id=request.request_id, prompt_token_ids=request.prompt_token_ids, + token_type_ids=request.token_type_ids, prompt=request.prompt, mm_inputs=request.mm_inputs, mm_hashes=request.mm_hashes, diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py index f9ec79db4457..008425445074 100644 --- a/vllm/v1/engine/__init__.py +++ b/vllm/v1/engine/__init__.py @@ -54,6 +54,7 @@ class EngineCoreRequest( # Detokenizer, but set to None when it is added to EngineCoreClient. prompt: Optional[str] prompt_token_ids: list[int] + token_type_ids: Optional[list[int]] mm_inputs: Optional[list[MultiModalKwargs]] mm_hashes: Optional[list[str]] mm_placeholders: Optional[list[PlaceholderRange]] diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index 2017c69c5dac..2cc80cedcb3c 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -275,6 +275,7 @@ def process_inputs( request_id=request_id, prompt=decoder_inputs.get("prompt"), prompt_token_ids=decoder_inputs["prompt_token_ids"], + token_type_ids=decoder_inputs.get("token_type_ids"), mm_inputs=sorted_mm_inputs, mm_hashes=sorted_mm_hashes, mm_placeholders=sorted_mm_positions, diff --git a/vllm/v1/request.py b/vllm/v1/request.py index 711b3b373739..85b3560f6a10 100644 --- a/vllm/v1/request.py +++ b/vllm/v1/request.py @@ -24,6 +24,7 @@ def __init__( request_id: str, prompt: Optional[str], prompt_token_ids: list[int], + token_type_ids: Optional[list[int]], multi_modal_inputs: Optional[list["MultiModalKwargs"]], multi_modal_hashes: Optional[list[str]], multi_modal_placeholders: Optional[list["PlaceholderRange"]], @@ -58,6 +59,7 @@ def __init__( self.prompt = prompt self.prompt_token_ids = prompt_token_ids + self.token_type_ids = token_type_ids self.num_prompt_tokens = len(self.prompt_token_ids) self._output_token_ids: list[int] = [] self._all_token_ids: list[int] = self.prompt_token_ids.copy() @@ -88,6 +90,7 @@ def from_engine_core_request(cls, request: EngineCoreRequest) -> "Request": request_id=request.request_id, prompt=request.prompt, prompt_token_ids=request.prompt_token_ids, + token_type_ids=request.token_type_ids, multi_modal_inputs=request.mm_inputs, multi_modal_hashes=request.mm_hashes, multi_modal_placeholders=request.mm_placeholders, diff --git a/vllm/v1/serial_utils.py b/vllm/v1/serial_utils.py index 146d7d747f1a..987835b6fba5 100644 --- a/vllm/v1/serial_utils.py +++ b/vllm/v1/serial_utils.py @@ -38,7 +38,7 @@ def decode(self, obj: Any): def custom_enc_hook(obj: Any) -> Any: - if isinstance(obj, torch.Tensor): + if isinstance(obj, torch.Tensor) and obj.dtype is not torch.bfloat16: # 
NOTE(rob): it is fastest to use numpy + pickle # when serializing torch tensors. # https://gist.github.com/tlrmchlsmth/8067f1b24a82b6e2f90450e7764fa103 # noqa: E501 diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py index 086b22d80512..e98887af001c 100644 --- a/vllm/v1/worker/gpu_input_batch.py +++ b/vllm/v1/worker/gpu_input_batch.py @@ -26,6 +26,7 @@ class CachedRequestState: req_id: str prompt_token_ids: list[int] + token_type_ids: Optional[list[int]] prompt: Optional[str] mm_inputs: list[MultiModalKwargs] mm_positions: list[PlaceholderRange] @@ -88,6 +89,8 @@ def __init__( pin_memory=False, ) self.token_ids_cpu = self.token_ids_cpu_tensor.numpy() + self.token_type_ids_cpu_tensor = None + self._token_type_ids_cpu = None self.num_tokens = np.zeros(max_num_reqs, dtype=np.int32) self.num_tokens_no_spec = np.zeros(max_num_reqs, dtype=np.int32) self.num_prompt_tokens = np.zeros(max_num_reqs, dtype=np.int32) @@ -228,6 +231,22 @@ def __init__( self.pooling_reqs: set[str] = set() + @property + def token_type_ids_cpu(self) -> np.ndarray: + if self._token_type_ids_cpu is None: + self.token_type_ids_cpu_tensor = torch.zeros( + self.token_ids_cpu_tensor.shape, + device="cpu", + dtype=torch.int8, + pin_memory=False, + ) + self._token_type_ids_cpu = cast( + torch.Tensor, self.token_type_ids_cpu_tensor).numpy() + return self._token_type_ids_cpu + + def has_token_types(self) -> bool: + return self._token_type_ids_cpu is not None + @property def req_ids(self) -> list[str]: # None elements should only be present transiently @@ -258,6 +277,9 @@ def add_request( self.num_prompt_tokens[req_index] = num_prompt_tokens self.token_ids_cpu[ req_index, :num_prompt_tokens] = request.prompt_token_ids + if request.token_type_ids is not None: + self.token_type_ids_cpu[ + req_index, :num_prompt_tokens] = request.token_type_ids start_idx = num_prompt_tokens end_idx = start_idx + len(request.output_token_ids) self.token_ids_cpu[req_index, @@ -443,6 +465,10 @@ def swap_states(self, i1: int, i2: int) -> None: tmp = self.token_ids_cpu[i1, ...].copy() self.token_ids_cpu[i1, ...] = self.token_ids_cpu[i2, ...] self.token_ids_cpu[i2, ...] = tmp + if self.has_token_types(): + tmp2 = self.token_type_ids_cpu[i1, ...].copy() + self.token_type_ids_cpu[i1, ...] = self.token_type_ids_cpu[i2, ...] + self.token_type_ids_cpu[i2, ...] = tmp2 swap_dict_values(self.generators, i1, i2) swap_dict_values(self.min_tokens, i1, i2) @@ -494,6 +520,9 @@ def condense(self, empty_req_indices: list[int]) -> None: num_tokens = self.num_tokens[last_req_index] self.token_ids_cpu[empty_index, :num_tokens] = self.token_ids_cpu[ last_req_index, :num_tokens] + if self.has_token_types(): + self.token_type_ids_cpu[empty_index, :num_tokens] = \ + self.token_type_ids_cpu[last_req_index, :num_tokens] self.num_tokens[empty_index] = num_tokens self.num_tokens_no_spec[empty_index] = self.num_tokens_no_spec[ last_req_index] diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 6a875d8c3f6e..cc5b3031ad03 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -3,7 +3,7 @@ import gc import time import weakref -from typing import TYPE_CHECKING, Optional, Union +from typing import TYPE_CHECKING, Optional, Union, cast import numpy as np import torch @@ -205,6 +205,7 @@ def __init__( self.positions = torch.zeros(self.max_num_tokens, dtype=torch.int64, device=self.device) + self.token_type_ids = None # None in the first PP rank. The rest are set after load_model. 
self.intermediate_tensors: Optional[IntermediateTensors] = None @@ -271,6 +272,13 @@ def __init__( pin_memory=self.pin_memory) self.seq_lens_np = self.seq_lens_cpu.numpy() + def get_token_type_ids(self) -> Optional[torch.Tensor]: + if self.token_type_ids is None: + self.token_type_ids = torch.zeros(self.max_num_tokens, + dtype=torch.int8, + device=self.device) + return self.token_type_ids + def _update_states(self, scheduler_output: "SchedulerOutput") -> None: """Update the cached states and the persistent batch with the scheduler output. @@ -338,6 +346,7 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None: self.requests[req_id] = CachedRequestState( req_id=req_id, prompt_token_ids=new_req_data.prompt_token_ids, + token_type_ids=new_req_data.token_type_ids, prompt=new_req_data.prompt, mm_inputs=new_req_data.mm_inputs, mm_positions=new_req_data.mm_positions, @@ -537,6 +546,13 @@ def _prepare_inputs( 0, torch.from_numpy(token_indices), out=self.input_ids_cpu[:total_num_scheduled_tokens]) + if self.input_batch.token_type_ids_cpu_tensor is not None: + token_type_ids = torch.index_select( + self.input_batch.token_type_ids_cpu_tensor.flatten(), 0, + torch.from_numpy(token_indices)) + # Copy the tensors to the GPU. + self.get_token_type_ids()[:total_num_scheduled_tokens]\ + .copy_(token_type_ids, non_blocking=True) # Calculate the slot mapping. # E.g., [0, 1, 0, 1, 2, 3, 4, 0, 1, 2] @@ -1008,11 +1024,17 @@ def execute_model( num_input_tokens = num_scheduled_tokens attn_metadata.num_input_tokens = num_input_tokens + has_token_types = self.token_type_ids is not None + model_kwargs = {} + if self.is_multimodal_model: # NOTE(woosuk): To unify token ids and soft tokens (vision # embeddings), we always use embeddings (rather than token ids) # as input to the multimodal model, even when the input is text. input_ids = self.input_ids[:num_scheduled_tokens] + if has_token_types: + model_kwargs["token_type_ids"] = cast( + torch.Tensor, self.token_type_ids)[:num_scheduled_tokens] if encoder_outputs: inputs_embeds = self.model.get_input_embeddings( input_ids, encoder_outputs) @@ -1028,6 +1050,9 @@ def execute_model( # multimodal models, it is not desirable for performance since # then the embedding layer is not included in the CUDA graph. input_ids = self.input_ids[:num_input_tokens] + if has_token_types: + model_kwargs["token_type_ids"] = cast( + torch.Tensor, self.token_type_ids)[:num_input_tokens] inputs_embeds = None if self.uses_mrope: positions = self.mrope_positions[:, :num_input_tokens] @@ -1055,6 +1080,7 @@ def execute_model( positions=positions, intermediate_tensors=intermediate_tensors, inputs_embeds=inputs_embeds, + **model_kwargs, ) if not get_pp_group().is_last_rank: # For mid-pipeline stages, return the hidden states. 
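Before the TPU-side changes below, a note on what the new token_type_ids plumbing is for: cross-encoders score a (query, document) pair packed into a single prompt, and BERT-style models rely on a per-token segment id to tell the two texts apart. A toy illustration of how the flattened ids line up in the batch (the token id values are made up):

# Illustration only: the real ids come from the tokenizer's
# text/text_pair encoding, not from hand-written lists.
import torch

query_ids = [101, 2054, 2003, 102]   # e.g. [CLS] what is [SEP]
doc_ids = [1037, 7099, 6251, 102]    # e.g. a sample sentence [SEP]

input_ids = torch.tensor(query_ids + doc_ids)
# Segment 0 for the query tokens, segment 1 for the document tokens,
# stored as int8 just like token_type_ids_cpu above.
token_type_ids = torch.tensor([0] * len(query_ids) + [1] * len(doc_ids),
                              dtype=torch.int8)

# After _prepare_inputs the model call is roughly:
#   model(input_ids=..., positions=..., token_type_ids=token_type_ids)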
diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index 26068dc919b5..144ece9e0e8d 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -277,6 +277,7 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> bool: self.requests[req_id] = CachedRequestState( req_id=req_id, prompt_token_ids=new_req_data.prompt_token_ids, + token_type_ids=new_req_data.token_type_ids, prompt=new_req_data.prompt, mm_inputs=new_req_data.mm_inputs, mm_positions=new_req_data.mm_positions, From 4014d411aa28503b2175603fc88d617a4af1ea08 Mon Sep 17 00:00:00 2001 From: Max de Bayser Date: Mon, 7 Apr 2025 23:49:12 -0300 Subject: [PATCH 12/63] address review comments Signed-off-by: Max de Bayser --- vllm/model_executor/models/bert.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/vllm/model_executor/models/bert.py b/vllm/model_executor/models/bert.py index 7ee5681f5833..d6011d9ffba8 100644 --- a/vllm/model_executor/models/bert.py +++ b/vllm/model_executor/models/bert.py @@ -419,11 +419,10 @@ def forward( intermediate_tensors: Optional[IntermediateTensors] = None, inputs_embeds: Optional[torch.Tensor] = None, ) -> torch.Tensor: - hidden_states = self.model(input_ids=input_ids, - position_ids=positions, - inputs_embeds=inputs_embeds, - intermediate_tensors=intermediate_tensors) - return hidden_states + return self.model(input_ids=input_ids, + position_ids=positions, + inputs_embeds=inputs_embeds, + intermediate_tensors=intermediate_tensors) def pooler( self, From 902c129236eabc37ccca623a66b4d02fb873bde0 Mon Sep 17 00:00:00 2001 From: Max de Bayser Date: Tue, 8 Apr 2025 00:05:19 -0300 Subject: [PATCH 13/63] address review comments Signed-off-by: Max de Bayser --- vllm/v1/engine/llm_engine.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index f9af30b415be..d03f206712cf 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -2,7 +2,7 @@ from collections.abc import Mapping from copy import copy -from typing import Any, Callable, Optional, Union, cast +from typing import Any, Callable, Optional, Union from typing_extensions import TypeVar @@ -189,9 +189,7 @@ def add_request( prompt_adapter_request, priority) - n = params.n if isinstance(params, SamplingParams) else 1 - - if n == 1: + if not isinstance(params, SamplingParams): # Make a new RequestState and queue. self.output_processor.add_request(request, None, 0) # Add the request to EngineCore. @@ -199,7 +197,8 @@ def add_request( return # Fan out child requests (for n>1). 
- parent_req = ParentRequest(request_id, cast(SamplingParams, params)) + parent_req = ParentRequest(request_id, params) + n = params.n for idx in range(n): request_id, params = parent_req.get_child_info(idx) child_request = request if idx == n - 1 else copy(request) From 2c68855990e9960af58939a455fcd709aadb9c84 Mon Sep 17 00:00:00 2001 From: Max de Bayser Date: Tue, 8 Apr 2025 00:07:14 -0300 Subject: [PATCH 14/63] re-enable large embedding models Signed-off-by: Max de Bayser --- tests/models/embedding/language/test_embedding.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/models/embedding/language/test_embedding.py b/tests/models/embedding/language/test_embedding.py index ec470681ff85..108fcbd8a5ae 100644 --- a/tests/models/embedding/language/test_embedding.py +++ b/tests/models/embedding/language/test_embedding.py @@ -27,14 +27,14 @@ def v1(run_with_both_engines): marks=[pytest.mark.core_model, pytest.mark.cpu_model]), pytest.param("sentence-transformers/all-MiniLM-L12-v2"), pytest.param("intfloat/multilingual-e5-small"), - #pytest.param("Alibaba-NLP/gte-Qwen2-7B-instruct"), + pytest.param("Alibaba-NLP/gte-Qwen2-7B-instruct"), # [Decoder-only] - #pytest.param("BAAI/bge-multilingual-gemma2", - # marks=[pytest.mark.core_model]), - #pytest.param("intfloat/e5-mistral-7b-instruct", - # marks=[pytest.mark.core_model, pytest.mark.cpu_model]), + pytest.param("BAAI/bge-multilingual-gemma2", + marks=[pytest.mark.core_model]), + pytest.param("intfloat/e5-mistral-7b-instruct", + marks=[pytest.mark.core_model, pytest.mark.cpu_model]), pytest.param("Alibaba-NLP/gte-Qwen2-1.5B-instruct"), - #pytest.param("ssmits/Qwen2-7B-Instruct-embed-base"), + pytest.param("ssmits/Qwen2-7B-Instruct-embed-base"), # [Cross-Encoder] pytest.param("sentence-transformers/stsb-roberta-base-v2"), ], From 8afd8f5ab21bbe3222283a923f077fc7500dcc6b Mon Sep 17 00:00:00 2001 From: Max de Bayser Date: Tue, 8 Apr 2025 07:35:12 -0300 Subject: [PATCH 15/63] address review comments Signed-off-by: Max de Bayser --- vllm/v1/engine/llm_engine.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index d03f206712cf..14e6270e4847 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -189,7 +189,7 @@ def add_request( prompt_adapter_request, priority) - if not isinstance(params, SamplingParams): + if not isinstance(params, SamplingParams) or (n := params.n) == 1: # Make a new RequestState and queue. self.output_processor.add_request(request, None, 0) # Add the request to EngineCore. @@ -198,7 +198,6 @@ def add_request( # Fan out child requests (for n>1). 
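Patches 13 and 15 adjust LLMEngine.add_request so that pooling requests (and plain n == 1 sampling requests) go straight to the engine core, and child requests are only fanned out for SamplingParams with n > 1, as the hunk continuing below shows. A simplified sketch of that dispatch, using stand-in classes rather than the real engine plumbing:

    from copy import copy
    from dataclasses import dataclass
    from typing import Union


    @dataclass
    class SamplingParams:   # stand-in for vllm.SamplingParams
        n: int = 1


    @dataclass
    class PoolingParams:    # stand-in for vllm.PoolingParams
        pass


    @dataclass
    class Request:          # stand-in for the engine-core request
        request_id: str
        params: Union[SamplingParams, PoolingParams]


    def add_request(request: Request, queued: list) -> None:
        params = request.params
        # Pooling requests never have child requests, and neither do
        # n == 1 sampling requests, so both are queued directly.
        if not isinstance(params, SamplingParams) or params.n == 1:
            queued.append(request)
            return
        # Fan out one child request per sample for n > 1.
        for idx in range(params.n):
            child = request if idx == params.n - 1 else copy(request)
            child.request_id = f"{request.request_id}-{idx}"
            queued.append(child)


    queue: list[Request] = []
    add_request(Request("embed-1", PoolingParams()), queue)
    add_request(Request("gen-1", SamplingParams(n=3)), queue)
    print([r.request_id for r in queue])
    # ['embed-1', 'gen-1-0', 'gen-1-1', 'gen-1-2']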
parent_req = ParentRequest(request_id, params) - n = params.n for idx in range(n): request_id, params = parent_req.get_child_info(idx) child_request = request if idx == n - 1 else copy(request) From 4b066a3252246f13b0d3147662c38aab0ac124b5 Mon Sep 17 00:00:00 2001 From: Max de Bayser Date: Tue, 29 Apr 2025 21:33:59 -0300 Subject: [PATCH 16/63] fix merge problems Signed-off-by: Max de Bayser --- vllm/model_executor/layers/pooler.py | 22 ++++++++++++---------- vllm/v1/worker/gpu_model_runner.py | 3 ++- 2 files changed, 14 insertions(+), 11 deletions(-) diff --git a/vllm/model_executor/layers/pooler.py b/vllm/model_executor/layers/pooler.py index c2e4506c1ad6..5a333fe2c095 100644 --- a/vllm/model_executor/layers/pooler.py +++ b/vllm/model_executor/layers/pooler.py @@ -240,17 +240,19 @@ def __init__(self, *, normalize: bool, softmax: bool) -> None: def forward(self, pooled_data: Union[list[torch.Tensor], torch.Tensor], pooling_metadata: PoolingMetadata): - dimensions_list = [ - pooling_param.dimensions - for _, pooling_param in pooling_metadata.seq_groups - ] - if any(d is not None for d in dimensions_list): - # change the output dimension - assert len(pooled_data) == len(dimensions_list) - pooled_data = [ - vecs if d is None else vecs[..., :d] - for vecs, d in zip(pooled_data, dimensions_list) + if isinstance(pooling_metadata, V0PoolingMetadata): + # TODO: enable matryoshka for V1 + dimensions_list = [ + pooling_param.dimensions + for _, pooling_param in pooling_metadata.seq_groups ] + if any(d is not None for d in dimensions_list): + # change the output dimension + assert len(pooled_data) == len(dimensions_list) + pooled_data = [ + vecs if d is None else vecs[..., :d] + for vecs, d in zip(pooled_data, dimensions_list) + ] if self.normalize: if isinstance(pooled_data, list): diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index ce125f15ec8d..e6b75fd5f90f 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1125,6 +1125,7 @@ def execute_model( positions=positions, intermediate_tensors=intermediate_tensors, inputs_embeds=inputs_embeds, + **model_kwargs, ) if self.use_aux_hidden_state_outputs: @@ -1563,7 +1564,7 @@ def _dummy_run( hidden_states = outputs logit_indices = np.cumsum(num_scheduled_tokens) - 1 - return hidden_states[logit_indices] + return hidden_states, hidden_states[logit_indices], num_reqs @torch.inference_mode() def _dummy_sampler_run( From bf3033def0c316c468a771ab1528c6b14e262925 Mon Sep 17 00:00:00 2001 From: Max de Bayser Date: Wed, 30 Apr 2025 15:55:43 -0300 Subject: [PATCH 17/63] Fix missing qwen embedding model param Signed-off-by: Max de Bayser --- vllm/model_executor/models/qwen2.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py index f76f31c9fc8d..4764d86a1cd8 100644 --- a/vllm/model_executor/models/qwen2.py +++ b/vllm/model_executor/models/qwen2.py @@ -535,8 +535,10 @@ def forward( input_ids: torch.Tensor, positions: torch.Tensor, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, ) -> torch.Tensor: - return self.model(input_ids, positions, intermediate_tensors) + return self.model(input_ids, positions, intermediate_tensors, + inputs_embeds) def pooler( self, From 67bf72784612e2f66424425816ae4d1a7b7ea71d Mon Sep 17 00:00:00 2001 From: Max de Bayser Date: Thu, 1 May 2025 12:14:40 -0300 Subject: [PATCH 18/63] Make pooling params reach the pooling 
in V1 Signed-off-by: Max de Bayser --- vllm/model_executor/layers/pooler.py | 20 ++++++++++++-------- vllm/v1/pool/metadata.py | 3 +++ vllm/v1/worker/gpu_input_batch.py | 16 +++++++++++++--- vllm/v1/worker/gpu_model_runner.py | 8 +++++--- 4 files changed, 33 insertions(+), 14 deletions(-) diff --git a/vllm/model_executor/layers/pooler.py b/vllm/model_executor/layers/pooler.py index 5a333fe2c095..e3888d730c13 100644 --- a/vllm/model_executor/layers/pooler.py +++ b/vllm/model_executor/layers/pooler.py @@ -241,18 +241,22 @@ def forward(self, pooled_data: Union[list[torch.Tensor], torch.Tensor], pooling_metadata: PoolingMetadata): if isinstance(pooling_metadata, V0PoolingMetadata): - # TODO: enable matryoshka for V1 dimensions_list = [ pooling_param.dimensions for _, pooling_param in pooling_metadata.seq_groups ] - if any(d is not None for d in dimensions_list): - # change the output dimension - assert len(pooled_data) == len(dimensions_list) - pooled_data = [ - vecs if d is None else vecs[..., :d] - for vecs, d in zip(pooled_data, dimensions_list) - ] + else: + dimensions_list = [ + pooling_param.dimensions + for pooling_param in pooling_metadata.pooling_params + ] + if any(d is not None for d in dimensions_list): + # change the output dimension + assert len(pooled_data) == len(dimensions_list) + pooled_data = [ + vecs if d is None else vecs[..., :d] + for vecs, d in zip(pooled_data, dimensions_list) + ] if self.normalize: if isinstance(pooled_data, list): diff --git a/vllm/v1/pool/metadata.py b/vllm/v1/pool/metadata.py index 51eb1d97f02d..d70a0d044661 100644 --- a/vllm/v1/pool/metadata.py +++ b/vllm/v1/pool/metadata.py @@ -4,6 +4,8 @@ import torch +from vllm.pooling_params import PoolingParams + @dataclass class PoolingMetadata: @@ -11,3 +13,4 @@ class PoolingMetadata: prompt_lens: torch.Tensor prompt_token_ids: Optional[torch.Tensor] + pooling_params: list[PoolingParams] diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py index 9331149c9115..1e6c3b34b92a 100644 --- a/vllm/v1/worker/gpu_input_batch.py +++ b/vllm/v1/worker/gpu_input_batch.py @@ -228,7 +228,7 @@ def __init__( # This is updated each time the batch constituents change. self.sampling_metadata = self._make_sampling_metadata() - self.pooling_reqs: set[str] = set() + self.pooling_params: dict[str, PoolingParams] = {} @property def token_type_ids_cpu(self) -> np.ndarray: @@ -367,7 +367,8 @@ def add_request( self.bad_words_token_ids[ req_index] = sampling_params.bad_words_token_ids else: - self.pooling_reqs.add(req_id) + assert request.pooling_params is not None + self.pooling_params[req_id] = request.pooling_params # Add request lora ID if request.lora_request: @@ -420,7 +421,7 @@ def remove_request(self, req_id: str) -> Optional[int]: # False means we don't fill with -inf. 
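The Pooler change above reads the requested output dimension from each request's PoolingParams in V1 (instead of the V0 seq_groups), which enables matryoshka-style truncation of embeddings per request. A small sketch of the trimming step, assuming one pooled vector and one optional dimensions value per request:

    from typing import Optional

    import torch

    # One pooled embedding per request (hidden size 8 here).
    pooled_data = [torch.randn(8) for _ in range(3)]

    # Requested output dimension per request; None keeps the full vector.
    dimensions_list: list[Optional[int]] = [None, 4, 2]

    if any(d is not None for d in dimensions_list):
        assert len(pooled_data) == len(dimensions_list)
        pooled_data = [
            vecs if d is None else vecs[..., :d]
            for vecs, d in zip(pooled_data, dimensions_list)
        ]

    print([v.shape[-1] for v in pooled_data])  # [8, 4, 2]

Normalization, when enabled, runs after this truncation, so the shortened vectors are re-normalized rather than being raw prefixes of the full embedding.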
self.allowed_token_ids_mask_cpu_tensor[req_index].fill_(False) self.bad_words_token_ids.pop(req_index, None) - self.pooling_reqs.discard(req_id) + self.pooling_params.pop(req_id, None) return req_index def swap_states(self, i1: int, i2: int) -> None: @@ -635,10 +636,19 @@ def _make_sampling_metadata(self) -> SamplingMetadata: @property def pooling_metadata(self) -> PoolingMetadata: + + # Note, for now this assumes that all request in the batch + # are either sampling or pooling requests + assert len(self.req_ids) == len(self.pooling_params) + pooling_params = [ + self.pooling_params[req_id] for req_id in self.req_ids + ] + return PoolingMetadata( prompt_lens=torch.from_numpy( self.num_prompt_tokens[:self.num_reqs]).to(self.device), prompt_token_ids=self.sampling_metadata.prompt_token_ids, + pooling_params=pooling_params, ) def _make_prompt_token_ids_tensor(self) -> torch.Tensor: diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 40c1c46054f7..24e2791b2cdd 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -24,6 +24,7 @@ from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import MultiModalKwargs, PlaceholderRange from vllm.multimodal.utils import group_mm_inputs_by_modality +from vllm.pooling_params import PoolingParams from vllm.sampling_params import SamplingType from vllm.sequence import IntermediateTensors from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, DeviceMemoryProfiler, @@ -1151,9 +1152,9 @@ def execute_model( sample_hidden_states = hidden_states[logits_indices] - if self.input_batch.pooling_reqs: + if self.input_batch.pooling_params: assert self.input_batch.num_reqs ==\ - len(self.input_batch.pooling_reqs), \ + len(self.input_batch.pooling_params), \ "Either all or none of the requests in" \ " a batch must be pooling request" @@ -1664,7 +1665,8 @@ def _dummy_pooler_run( device=self.device), prompt_token_ids=torch.zeros((num_reqs, req_num_tokens), dtype=torch.int32, - device=self.device)) + device=self.device), + pooling_params=[PoolingParams()] * num_reqs) try: pooler_output = self.model.pooler(hidden_states=hidden_states, From bad4211a9244f7ede96d9e7a56c709b94f270344 Mon Sep 17 00:00:00 2001 From: Max de Bayser Date: Sat, 10 May 2025 13:07:15 -0300 Subject: [PATCH 19/63] fix merge problems Signed-off-by: Max de Bayser --- vllm/model_executor/models/bert.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vllm/model_executor/models/bert.py b/vllm/model_executor/models/bert.py index 3af1769f9f02..6a660f6d8d41 100644 --- a/vllm/model_executor/models/bert.py +++ b/vllm/model_executor/models/bert.py @@ -437,6 +437,8 @@ def forward( attn_metadata = get_forward_context().attn_metadata seq_lens = None if attn_metadata is not None: # Can be None during warmup + if isinstance(attn_metadata, dict): + attn_metadata = next(iter(attn_metadata.values())) seq_lens = getattr(attn_metadata, "seq_lens_tensor", attn_metadata.seq_lens) assert seq_lens is not None From 7c5be8817e3725c400388800a92284846a8a9960 Mon Sep 17 00:00:00 2001 From: Max de Bayser Date: Thu, 15 May 2025 13:01:08 -0300 Subject: [PATCH 20/63] fix merge problem Signed-off-by: Max de Bayser --- vllm/v1/core/sched/scheduler.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index 9dadedaff5c4..c8578db7711c 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -782,6 +782,8 @@ def update_from_output( # Get 
prompt logprobs for this request. prompt_logprobs_tensors = prompt_logprobs_dict.get(req_id) if new_token_ids or kv_transfer_params: + pooler_output = pooler_outputs[req_index] \ + if pooler_outputs else None # Add EngineCoreOutput for this Request. outputs.append( @@ -791,7 +793,7 @@ def update_from_output( finish_reason=request.get_finished_reason(), new_logprobs=new_logprobs, new_prompt_logprobs_tensors=prompt_logprobs_tensors, - pooling_output=pooler_outputs[req_index], + pooling_output=pooler_output, stop_reason=request.stop_reason, events=request.take_events(), kv_transfer_params=kv_transfer_params, From 6aa204c2786704b59a34d41edf2bdde65be3dfa5 Mon Sep 17 00:00:00 2001 From: Max de Bayser Date: Wed, 4 Jun 2025 13:25:28 -0300 Subject: [PATCH 21/63] backport changes from the other PR Signed-off-by: Max de Bayser --- tests/entrypoints/llm/test_encode.py | 24 +++++- vllm/config.py | 41 +++++++++-- vllm/forward_context.py | 7 +- vllm/model_executor/layers/pooler.py | 98 ++++++++++++++++++------- vllm/model_executor/models/bert.py | 20 ++--- vllm/model_executor/models/roberta.py | 60 +++++++-------- vllm/v1/core/kv_cache_manager.py | 3 +- vllm/v1/core/kv_cache_utils.py | 1 + vllm/v1/core/sched/scheduler.py | 18 +++-- vllm/v1/core/sched/utils.py | 14 +++- vllm/v1/engine/async_llm.py | 2 +- vllm/v1/engine/core.py | 20 ++++- vllm/v1/kv_cache_interface.py | 1 + vllm/v1/metrics/stats.py | 1 - vllm/v1/outputs.py | 2 +- vllm/v1/request.py | 3 +- vllm/v1/sample/metadata.py | 7 +- vllm/v1/structured_output/__init__.py | 4 +- vllm/v1/worker/gpu_model_runner.py | 92 ++++++++++++++++------- vllm/v1/worker/tpu_model_runner.py | 2 + vllm/worker/cpu_pooling_model_runner.py | 9 ++- vllm/worker/pooling_model_runner.py | 9 ++- 22 files changed, 308 insertions(+), 130 deletions(-) diff --git a/tests/entrypoints/llm/test_encode.py b/tests/entrypoints/llm/test_encode.py index f0fa54aa3131..b930f05bebd0 100644 --- a/tests/entrypoints/llm/test_encode.py +++ b/tests/entrypoints/llm/test_encode.py @@ -8,6 +8,8 @@ from vllm import LLM, PoolingParams, PoolingRequestOutput from vllm.distributed import cleanup_dist_env_and_memory +from ...models.utils import check_embeddings_close + MODEL_NAME = "intfloat/multilingual-e5-small" PROMPTS = [ @@ -27,6 +29,14 @@ ] +@pytest.fixture(autouse=True) +def v1(run_with_both_engines): + # Simple autouse wrapper to run both engines for each test + # This can be promoted up to conftest.py to run for every + # test in a package + pass + + @pytest.fixture(scope="module") def llm(): # pytest caches the fixture so we use weakref.proxy to @@ -46,9 +56,15 @@ def llm(): cleanup_dist_env_and_memory() -def assert_outputs_equal(o1: list[PoolingRequestOutput], +def assert_outputs_match(o1: list[PoolingRequestOutput], o2: list[PoolingRequestOutput]): - assert [o.outputs for o in o1] == [o.outputs for o in o2] + check_embeddings_close( + embeddings_0_lst=[o.outputs.data for o in o1], + embeddings_1_lst=[o.outputs.data for o in o2], + name_0="hf", + name_1="vllm", + tol=1e-2, + ) @pytest.mark.skip_global_cleanup @@ -63,7 +79,7 @@ def test_v1_v2_api_consistency_single_prompt_tokens(llm: LLM, v2_output = llm.encode({"prompt_token_ids": prompt_token_ids}, pooling_params=pooling_params) - assert_outputs_equal(v1_output, v2_output) + assert_outputs_match(v1_output, v2_output) @pytest.mark.skip_global_cleanup @@ -80,7 +96,7 @@ def test_v1_v2_api_consistency_multi_prompt_tokens(llm: LLM): } for p in TOKEN_IDS], pooling_params=pooling_params, ) - assert_outputs_equal(v1_output, v2_output) + 
assert_outputs_match(v1_output, v2_output) @pytest.mark.skip_global_cleanup diff --git a/vllm/config.py b/vllm/config.py index 1c995a588334..bf61379fa18d 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -703,12 +703,12 @@ def _get_encoder_config(self): def _init_pooler_config(self) -> Optional["PoolerConfig"]: if self.runner_type == "pooling": - logger.warning("CUDA graph is not supported for pooling yet, " - "fallback to the eager mode.") - self.enforce_eager = True if isinstance(self.override_pooler_config, dict): self.override_pooler_config = PoolerConfig( **self.override_pooler_config) + logger.warning("CUDA graph is not supported for pooling yet, " + "fallback to the eager mode.") + self.enforce_eager = True pooler_config = self.override_pooler_config or PoolerConfig() @@ -4442,14 +4442,45 @@ def __post_init__(self): "Disabling `torch.compile`.") self.compilation_config.level = CompilationLevel.NO_COMPILATION + disable_cascade_reasons: list[str] = [] + if self.compilation_config.full_cuda_graph and \ not self.model_config.disable_cascade_attn: - logger.warning_once( + disable_cascade_reasons.append( "full_cuda_graph is not supported with " "cascade attention. Disabling cascade attention.") - self.model_config.disable_cascade_attn = True self.cache_config.enable_prefix_caching = False + disable_chunked_prefill_reasons: list[str] = [] + + if self.model_config and self.model_config.pooler_config: + pooling_type = self.model_config.pooler_config.pooling_type + if pooling_type is None or pooling_type.lower() != "last": + disable_chunked_prefill_reasons.append( + "Only \"last\" pooling supports chunked " + "prefill and prefix caching; disabling both.") + + disable_cascade_reasons.append( + "Loaded model for pooling; disabling cascade attention.") + + if disable_chunked_prefill_reasons: + for reason in disable_chunked_prefill_reasons: + logger.info(reason) + self.scheduler_config.enable_chunked_prefill = False + self.scheduler_config.chunked_prefill_enabled = False + self.scheduler_config.long_prefill_token_threshold = 0 + self.scheduler_config.max_num_batched_tokens = max( + self.scheduler_config.max_model_len, + DEFAULT_MAX_NUM_BATCHED_TOKENS) + + if self.cache_config is not None: + self.cache_config.enable_prefix_caching = False + + if disable_cascade_reasons: + for reason in disable_cascade_reasons: + logger.info(reason) + self.model_config.disable_cascade_attn = True + if (self.kv_events_config is not None and self.kv_events_config.enable_kv_cache_events and not self.cache_config.enable_prefix_caching): diff --git a/vllm/forward_context.py b/vllm/forward_context.py index f3b0518a44e0..21ff4196f041 100644 --- a/vllm/forward_context.py +++ b/vllm/forward_context.py @@ -94,6 +94,7 @@ class ForwardContext: virtual_engine: int # set dynamically for each forward pass # set dynamically for each forward pass dp_metadata: Optional[DPMetadata] = None + seq_lens: Optional[torch.Tensor] = None _forward_context: Optional[ForwardContext] = None @@ -112,7 +113,8 @@ def set_forward_context(attn_metadata: Any, vllm_config: VllmConfig, virtual_engine: int = 0, num_tokens: Optional[int] = None, - num_tokens_across_dp: Optional[torch.Tensor] = None): + num_tokens_across_dp: Optional[torch.Tensor] = None, + seq_lens: Optional[torch.Tensor] = None): """A context manager that stores the current forward context, can be attention metadata, etc. Here we can inject common logic for every model forward pass. 
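The test_encode.py change at the top of this patch swaps exact equality (assert_outputs_equal) for check_embeddings_close, because the V0 and V1 code paths can legitimately differ by small numerical amounts. A rough sketch of such a tolerance-based comparison (not the actual helper from tests/models/utils.py), using cosine similarity:

    import torch


    def embeddings_close(a: list[list[float]], b: list[list[float]],
                         tol: float = 1e-2) -> bool:
        # Compare embeddings pairwise by cosine similarity instead of
        # exact equality, so tiny numerical differences between code
        # paths do not fail the test.
        assert len(a) == len(b)
        for x, y in zip(a, b):
            sim = torch.nn.functional.cosine_similarity(
                torch.tensor(x), torch.tensor(y), dim=0)
            if 1.0 - sim.item() > tol:
                return False
        return True


    print(embeddings_close([[1.0, 0.0]], [[0.999, 0.001]]))  # True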
@@ -135,7 +137,8 @@ def set_forward_context(attn_metadata: Any, static_forward_context, virtual_engine=virtual_engine, attn_metadata=attn_metadata, - dp_metadata=dp_metadata) + dp_metadata=dp_metadata, + seq_lens=seq_lens) try: yield diff --git a/vllm/model_executor/layers/pooler.py b/vllm/model_executor/layers/pooler.py index b12360d532ae..84ae3c79d44f 100644 --- a/vllm/model_executor/layers/pooler.py +++ b/vllm/model_executor/layers/pooler.py @@ -79,17 +79,18 @@ def __init__(self, *, normalize: bool, softmax: bool) -> None: def get_prompt_lens( self, - hidden_states: torch.Tensor, + hidden_states: Union[torch.Tensor, list[torch.Tensor]], pooling_metadata: PoolingMetadata, ) -> torch.Tensor: if isinstance(pooling_metadata, V1PoolingMetadata): return pooling_metadata.prompt_lens + assert isinstance(hidden_states, torch.Tensor) return PoolingTensors.from_pooling_metadata( pooling_metadata, hidden_states.device).prompt_lens def extract_states( self, - hidden_states: torch.Tensor, + hidden_states: Union[torch.Tensor, list[torch.Tensor]], pooling_metadata: PoolingMetadata, ) -> Union[list[torch.Tensor], torch.Tensor]: raise NotImplementedError @@ -99,7 +100,7 @@ def build_output(self, data: torch.Tensor) -> PoolingSequenceGroupOutput: def forward( self, - hidden_states: torch.Tensor, + hidden_states: Union[torch.Tensor, list[torch.Tensor]], pooling_metadata: PoolingMetadata, ) -> PoolerOutput: pooled_data = self.extract_states(hidden_states, pooling_metadata) @@ -112,11 +113,19 @@ class CLSPool(SimplePooler): def extract_states( self, - hidden_states: torch.Tensor, + hidden_states: Union[torch.Tensor, list[torch.Tensor]], pooling_metadata: PoolingMetadata, ) -> Union[list[torch.Tensor], torch.Tensor]: prompt_lens = self.get_prompt_lens(hidden_states, pooling_metadata) + if isinstance(hidden_states, list): + result = [] + for req_state, prompt_len in zip(hidden_states, prompt_lens): + assert prompt_len == req_state.shape[0], \ + "partial prefill not supported with CLS pooling" + result.append(req_state[0]) + return result + first_token_flat_indices = torch.zeros_like(prompt_lens) first_token_flat_indices[1:] += torch.cumsum(prompt_lens, dim=0)[:-1] return hidden_states[first_token_flat_indices] @@ -126,9 +135,12 @@ class LastPool(SimplePooler): def extract_states( self, - hidden_states: torch.Tensor, + hidden_states: Union[torch.Tensor, list[torch.Tensor]], pooling_metadata: PoolingMetadata, ) -> Union[list[torch.Tensor], torch.Tensor]: + if isinstance(hidden_states, list): + return [h[-1] for h in hidden_states] + prompt_lens = self.get_prompt_lens(hidden_states, pooling_metadata) last_token_flat_indices = torch.cumsum(prompt_lens, dim=0) - 1 @@ -139,11 +151,17 @@ class AllPool(SimplePooler): def extract_states( self, - hidden_states: torch.Tensor, + hidden_states: Union[torch.Tensor, list[torch.Tensor]], pooling_metadata: PoolingMetadata, ) -> Union[list[torch.Tensor], torch.Tensor]: prompt_lens = self.get_prompt_lens(hidden_states, pooling_metadata) + if isinstance(hidden_states, list): + for req_state, prompt_len in zip(hidden_states, prompt_lens): + assert prompt_len == req_state.shape[0], \ + "partial prefill not supported with ALL pooling" + return hidden_states + offset = 0 pooled_data = list[torch.Tensor]() for prompt_len in prompt_lens: @@ -157,11 +175,19 @@ class MeanPool(SimplePooler): def extract_states( self, - hidden_states: torch.Tensor, + hidden_states: Union[torch.Tensor, list[torch.Tensor]], pooling_metadata: PoolingMetadata, ) -> Union[list[torch.Tensor], torch.Tensor]: 
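The SimplePooler subclasses in the pooler.py diff above now accept either one flat [num_tokens, hidden] tensor (with prompt_lens describing the request boundaries) or a list of per-request [seq_len, hidden] tensors. A compact sketch showing that last-token pooling gives the same result under both layouts, with made-up sizes:

    import torch

    hidden = 4
    prompt_lens = torch.tensor([3, 2])

    # Flat layout: all requests concatenated along the token dimension.
    flat = torch.randn(int(prompt_lens.sum()), hidden)
    last_flat_indices = torch.cumsum(prompt_lens, dim=0) - 1   # [2, 4]
    pooled_from_flat = flat[last_flat_indices]                 # [2, hidden]

    # List layout: one tensor per request, e.g. obtained by splitting
    # the runner's hidden states at the per-request token counts.
    per_request = list(torch.split(flat, prompt_lens.tolist()))
    pooled_from_list = torch.stack([h[-1] for h in per_request])

    assert torch.equal(pooled_from_flat, pooled_from_list)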
prompt_lens = self.get_prompt_lens(hidden_states, pooling_metadata) + if isinstance(hidden_states, list): + result = [] + for req_state, prompt_len in zip(hidden_states, prompt_lens): + assert prompt_len == req_state.shape[0], \ + "partial prefill not supported with mean pooling" + result.append(torch.mean(req_state, dim=0)) + return result + cumsum = torch.cumsum(hidden_states, dim=0) start_indices = torch.cat([ torch.tensor([0], device=hidden_states.device), @@ -203,28 +229,37 @@ def get_prompt_token_ids( def extract_states( self, - hidden_states: torch.Tensor, + hidden_states: Union[torch.Tensor, list[torch.Tensor]], pooling_metadata: PoolingMetadata, ) -> Union[list[torch.Tensor], torch.Tensor]: prompt_lens = self.get_prompt_lens(hidden_states, pooling_metadata) prompt_token_ids = self.get_prompt_token_ids(pooling_metadata) - returned_token_ids = self.returned_token_ids - if returned_token_ids is not None and len(returned_token_ids) > 0: - hidden_states = hidden_states[:, returned_token_ids] + pooled_data: list[torch.Tensor] = [] + if isinstance(hidden_states, list): + for req_state, prompt_len in zip(hidden_states, prompt_lens): + assert prompt_len == req_state.shape[0], \ + "partial prefill not supported with mean pooling" + pooled_data = hidden_states + else: + offset = 0 + for prompt_len in prompt_lens: + pooled_data_i = hidden_states[offset:offset + prompt_len] + offset += prompt_len + pooled_data.append(pooled_data_i) + + pooled_data = [] + returned_token_ids = self.returned_token_ids step_tag_id = self.step_tag_id - offset = 0 - pooled_data = list[torch.Tensor]() - for i, prompt_len in enumerate(prompt_lens): - pooled_data_i = hidden_states[offset:offset + prompt_len] - if step_tag_id is not None: - pooled_data_i = pooled_data_i[prompt_token_ids[i] == - step_tag_id] + for data, token_id in zip(pooled_data, prompt_token_ids): + if returned_token_ids is not None and len(returned_token_ids) > 0: + data = data[:, returned_token_ids] - offset += prompt_len - pooled_data.append(pooled_data_i) + if step_tag_id is not None: + data = data[token_id == step_tag_id] + pooled_data.append(data) return pooled_data @@ -246,6 +281,7 @@ def forward(self, pooled_data: Union[list[torch.Tensor], torch.Tensor], for _, pooling_param in pooling_metadata.seq_groups ] else: + assert isinstance(pooled_data, list) dimensions_list = [ pooling_param.dimensions for pooling_param in pooling_metadata.pooling_params @@ -343,26 +379,39 @@ def __init__( def get_prompt_lens( self, - hidden_states: torch.Tensor, + hidden_states: Union[torch.Tensor, list[torch.Tensor]], pooling_metadata: PoolingMetadata, ) -> torch.Tensor: if isinstance(pooling_metadata, V1PoolingMetadata): return pooling_metadata.prompt_lens + assert isinstance(hidden_states, torch.Tensor) return PoolingTensors.from_pooling_metadata( pooling_metadata, hidden_states.device).prompt_lens def forward( self, - hidden_states: torch.Tensor, + hidden_states: Union[torch.Tensor, list[torch.Tensor]], pooling_metadata: PoolingMetadata, ) -> PoolerOutput: """Pools sentence pair scores from the hidden_states.""" prompt_lens = self.get_prompt_lens(hidden_states, pooling_metadata) + pooled_data = list[torch.Tensor]() + if isinstance(hidden_states, list): + for req_state, prompt_len in zip(hidden_states, prompt_lens): + assert prompt_len == req_state.shape[0], \ + "partial prefill not supported with classifier" + pooled_data = hidden_states + else: + offset = 0 + for prompt_len in prompt_lens: + pooled_data_i = hidden_states[offset:offset + prompt_len] + offset += 
prompt_len + pooled_data.append(pooled_data_i) + offset = 0 pooled_data_lst = [] - for prompt_len in prompt_lens: - pooled_data_i = hidden_states[offset:offset + prompt_len] + for pooled_data_i in pooled_data: if self.pooler is not None: final_shape_tensor = self.pooler(pooled_data_i) @@ -370,7 +419,6 @@ def forward( final_shape_tensor = self.classifier(pooled_data_i) pooled_data_lst.append(final_shape_tensor) - offset += prompt_len pooled_output = torch.stack(pooled_data_lst) diff --git a/vllm/model_executor/models/bert.py b/vllm/model_executor/models/bert.py index a86f948992c7..62ab69eae122 100644 --- a/vllm/model_executor/models/bert.py +++ b/vllm/model_executor/models/bert.py @@ -343,18 +343,12 @@ def forward( hidden_states = inputs_embeds else: attn_metadata = get_forward_context().attn_metadata - seq_lens = None - if attn_metadata is not None: # Can be None during warmup - if isinstance(attn_metadata, dict): - attn_metadata = next(iter(attn_metadata.values())) - seq_lens = getattr(attn_metadata, "seq_lens_tensor", - attn_metadata.seq_lens) - assert seq_lens is not None - hidden_states = self.embeddings(input_ids=input_ids, - seq_lens=seq_lens, - position_ids=position_ids, - token_type_ids=token_type_ids) - + assert hasattr(attn_metadata, "seq_lens_tensor") + hidden_states = self.embeddings( + input_ids=input_ids, + seq_lens=attn_metadata.seq_lens_tensor, + position_ids=position_ids, + token_type_ids=token_type_ids) return self.encoder(hidden_states) def load_weights(self, weights: Iterable[tuple[str, @@ -417,11 +411,13 @@ def forward( self, input_ids: Optional[torch.Tensor], positions: torch.Tensor, + token_type_ids: Optional[torch.Tensor] = None, intermediate_tensors: Optional[IntermediateTensors] = None, inputs_embeds: Optional[torch.Tensor] = None, ) -> torch.Tensor: hidden_states = self.model(input_ids=input_ids, position_ids=positions, + token_type_ids=token_type_ids, inputs_embeds=inputs_embeds, intermediate_tensors=intermediate_tensors) diff --git a/vllm/model_executor/models/roberta.py b/vllm/model_executor/models/roberta.py index 082acc9ac129..ac9dd4dcca77 100644 --- a/vllm/model_executor/models/roberta.py +++ b/vllm/model_executor/models/roberta.py @@ -10,6 +10,7 @@ from transformers import RobertaConfig from vllm.config import VllmConfig +from vllm.forward_context import get_forward_context from vllm.model_executor.layers.pooler import ClassifierPooler from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding) @@ -52,40 +53,41 @@ def __init__(self, config: RobertaConfig): def forward( self, input_ids: torch.Tensor, - seq_lens: torch.Tensor, position_ids: torch.Tensor, token_type_ids: Optional[torch.Tensor] = None, ) -> torch.Tensor: + input_shape = input_ids.size() inputs_embeds = self.word_embeddings(input_ids) - - if seq_lens is not None: # Can be None during warmup - # Replace position ids because in RoBERTa models - # they have to start at padding_idx + 1 and ignore - # existing padding tokens - # References: - # - https://github.com/huggingface/transformers/blob/a3d69a8994d673899608a7c17fbf4f953f50474e/src/transformers/models/roberta/modeling_roberta.py#L133 - # - https://github.com/huggingface/transformers/blob/a3d69a8994d673899608a7c17fbf4f953f50474e/src/transformers/models/roberta/modeling_roberta.py#L1669 - pos_list = [] - token_list = [] - offset = 0 - for seq_len in seq_lens: - pos_list.append(position_ids[offset:offset + seq_len]) - token_list.append(input_ids[offset:offset + seq_len]) - offset += seq_len - - new_pos_list = [] - 
for positions, tokens in zip(pos_list, token_list): - # Verify assumption that incoming position are - # always a sequence from 0 to N. - expected_pos = torch.arange(positions.size()[0], - dtype=torch.long, - device=inputs_embeds.device) - assert torch.equal(positions, expected_pos) - new_pos_list.append( - create_position_ids_from_input_ids(tokens, - self.padding_idx)) - position_ids = torch.cat(new_pos_list) + seq_lens = get_forward_context().seq_lens + + # Replace position ids because in RoBERTa models + # they have to start at padding_idx + 1 and ignore + # existing padding tokens + # References: + # - https://github.com/huggingface/transformers/blob/a3d69a8994d673899608a7c17fbf4f953f50474e/src/transformers/models/roberta/modeling_roberta.py#L133 + # - https://github.com/huggingface/transformers/blob/a3d69a8994d673899608a7c17fbf4f953f50474e/src/transformers/models/roberta/modeling_roberta.py#L1669 + pos_list = [] + token_list = [] + offset = 0 + for seq_len in seq_lens: + pos_list.append(position_ids[offset:offset + seq_len]) + token_list.append(input_ids[offset:offset + seq_len]) + offset += seq_len + + offset = 0 + for positions, tokens in zip(pos_list, token_list): + # Verify assumption that incoming position are + # always a sequence from 0 to N. + expected_pos = torch.arange(positions.size()[0], + dtype=torch.long, + device=inputs_embeds.device) + assert torch.equal(positions, expected_pos) + new_pos = create_position_ids_from_input_ids( + tokens, self.padding_idx) + seq_len = new_pos.shape[0] + position_ids[offset:offset + seq_len] = new_pos + offset += seq_len # Position embeddings. position_embeddings = self.position_embeddings(position_ids) diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py index 75ac897e8346..522da770ebf4 100644 --- a/vllm/v1/core/kv_cache_manager.py +++ b/vllm/v1/core/kv_cache_manager.py @@ -133,8 +133,7 @@ def get_computed_blocks(self, # When the request requires prompt logprobs, we skip prefix caching. if (not self.enable_caching or (request.sampling_params is not None - and request.sampling_params.prompt_logprobs is not None) - or request.pooling_params is not None): + and request.sampling_params.prompt_logprobs is not None)): return KVCacheBlocks.create_empty(), 0 # The block hashes for the request may already be computed diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py index ad3c21f794b9..53380b90620b 100644 --- a/vllm/v1/core/kv_cache_utils.py +++ b/vllm/v1/core/kv_cache_utils.py @@ -710,6 +710,7 @@ def unify_hybrid_kv_cache_specs(kv_cache_spec: dict[str, KVCacheSpec]): dtype=spec.dtype, use_mla=spec.use_mla, sliding_window=spec.sliding_window, + attn_type=str(spec.attn_type), ) diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index 80ad04363311..c9ae88af10c8 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -721,7 +721,8 @@ def update_from_output( continue req_index = model_runner_output.req_id_to_index[req_id] - generated_token_ids = sampled_token_ids[req_index] + generated_token_ids = sampled_token_ids[ + req_index] if sampled_token_ids else [] scheduled_spec_token_ids = ( scheduler_output.scheduled_spec_decode_tokens.get(req_id)) @@ -773,8 +774,16 @@ def update_from_output( del new_token_ids[num_new:] # Trim new tokens if needed. 
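The RoBERTa embedding code above rebuilds position ids with create_position_ids_from_input_ids because RoBERTa positions must start at padding_idx + 1 and skip padding tokens. A sketch of that computation for a flat token sequence, following the Hugging Face reference linked in the comments (padding_idx is 1 for RoBERTa):

    import torch


    def create_position_ids(input_ids: torch.Tensor,
                            padding_idx: int) -> torch.Tensor:
        # Non-padding tokens get consecutive positions starting at
        # padding_idx + 1; padding tokens keep position padding_idx.
        mask = input_ids.ne(padding_idx).int()
        incremental = torch.cumsum(mask, dim=0) * mask
        return incremental.long() + padding_idx


    tokens = torch.tensor([0, 31414, 232, 2, 1, 1])  # <s> ... </s> <pad> <pad>
    print(create_position_ids(tokens, padding_idx=1).tolist())
    # [2, 3, 4, 5, 1, 1]

This is also why the assert in the diff checks that the incoming positions are a plain 0..N-1 range before they are overwritten.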
break + pooler_output = None + if pooler_outputs: + pooler_output = pooler_outputs[req_index] + stopped = check_stop(request, self.max_model_len, + pooler_output) + if stopped: + kv_transfer_params = self._free_request(request) + # Extract sample logprobs if needed. - if request.sampling_params \ + if request.sampling_params is not None \ and request.sampling_params.logprobs is not None and logprobs: # NOTE: once we support N tokens per step (spec decode), # the outer lists can be of length > 1. @@ -800,9 +809,8 @@ def update_from_output( # Get prompt logprobs for this request. prompt_logprobs_tensors = prompt_logprobs_dict.get(req_id) - if new_token_ids or kv_transfer_params: - pooler_output = pooler_outputs[req_index] \ - if pooler_outputs else None + if new_token_ids or pooler_output is not None \ + or kv_transfer_params: # Add EngineCoreOutput for this Request. outputs[request.client_index].append( diff --git a/vllm/v1/core/sched/utils.py b/vllm/v1/core/sched/utils.py index daf7104e7b36..01c0ef402ae2 100644 --- a/vllm/v1/core/sched/utils.py +++ b/vllm/v1/core/sched/utils.py @@ -1,16 +1,26 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from typing import Optional + +import torch + from vllm.v1.request import Request, RequestStatus -def check_stop(request: Request, max_model_len: int) -> bool: +def check_stop(request: Request, + max_model_len: int, + pooler_output: Optional[torch.Tensor] = None) -> bool: if (request.num_tokens >= max_model_len or request.num_output_tokens >= request.max_tokens): request.status = RequestStatus.FINISHED_LENGTH_CAPPED return True - last_token_id = request.output_token_ids[-1] + if request.pooling_params and pooler_output is not None: + request.status = RequestStatus.FINISHED_STOPPED + return True + if (sampling_params := request.sampling_params) is not None: + last_token_id = request.output_token_ids[-1] if (not sampling_params.ignore_eos and last_token_id == request.eos_token_id): request.status = RequestStatus.FINISHED_STOPPED diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 673ddd6030dc..ecfe5be6d50c 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -495,7 +495,7 @@ async def encode( # Note: drain queue without await if possible (avoids # task switching under load which helps performance). out = q.get_nowait() or await q.get() - assert type(out) is PoolingRequestOutput + assert isinstance(out, PoolingRequestOutput) # Note: both OutputProcessor and EngineCore handle their # own request cleanup based on finished. 
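With the check_stop change above, a pooling request finishes as soon as a pooler output is available for it, while generation requests keep the usual length, EOS and stop-token checks. A minimal sketch of that ordering, with a stripped-down stand-in for the request object:

    from dataclasses import dataclass, field
    from typing import Optional

    import torch


    @dataclass
    class MiniRequest:                 # stand-in for vllm.v1.request.Request
        num_tokens: int
        pooling_params: Optional[object] = None
        output_token_ids: list = field(default_factory=list)
        status: str = "RUNNING"


    def check_stop(request: MiniRequest, max_model_len: int,
                   pooler_output: Optional[torch.Tensor] = None) -> bool:
        if request.num_tokens >= max_model_len:
            request.status = "FINISHED_LENGTH_CAPPED"
            return True
        # Pooling requests generate no tokens: a single forward pass
        # that produced a pooler output is enough to finish them.
        if request.pooling_params is not None and pooler_output is not None:
            request.status = "FINISHED_STOPPED"
            return True
        # Generation requests would fall through to EOS / stop checks here.
        return False


    req = MiniRequest(num_tokens=5, pooling_params=object())
    print(check_stop(req, max_model_len=512,
                     pooler_output=torch.zeros(8)))  # True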
finished = out.finished diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 1576f3bf9e8f..c1368877eb87 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -36,7 +36,7 @@ EngineCoreRequestType, UtilityOutput) from vllm.v1.engine.mm_input_cache import MirroredProcessingCache from vllm.v1.executor.abstract import Executor -from vllm.v1.kv_cache_interface import KVCacheConfig +from vllm.v1.kv_cache_interface import AttentionSpec, KVCacheConfig from vllm.v1.metrics.stats import SchedulerStats from vllm.v1.outputs import ModelRunnerOutput from vllm.v1.request import Request, RequestStatus @@ -149,6 +149,24 @@ def _initialize_kv_caches( for kv_cache_spec_one_worker, available_gpu_memory_one_worker in zip(kv_cache_specs, available_gpu_memory) ] + + for kv_cache_spec_one_worker in kv_cache_specs: + for _, spec in kv_cache_spec_one_worker.items(): + if isinstance(spec, AttentionSpec) and \ + spec.attn_type != "decoder": + + logger.info("Found non-decoder layer. Disabling " + "prefix cache and chunked prefill") + self.vllm_config.cache_config.\ + enable_prefix_caching = False + self.vllm_config.scheduler_config.\ + enable_chunked_prefill = False + self.vllm_config.scheduler_config.\ + chunked_prefill_enabled = False + self.vllm_config.scheduler_config.\ + long_prefill_token_threshold = 0 + break + # Since we use a shared centralized controller, we need the # `kv_cache_config` to be consistent across all workers to make sure # all the memory operators can be applied to all workers. diff --git a/vllm/v1/kv_cache_interface.py b/vllm/v1/kv_cache_interface.py index cf2eb3b95569..76d604a26376 100644 --- a/vllm/v1/kv_cache_interface.py +++ b/vllm/v1/kv_cache_interface.py @@ -74,6 +74,7 @@ class AttentionSpec(KVCacheSpec): head_size: int dtype: torch.dtype use_mla: bool + attn_type: str @property def page_size_bytes(self) -> int: diff --git a/vllm/v1/metrics/stats.py b/vllm/v1/metrics/stats.py index 50c8b07fe54d..a79d2aabd512 100644 --- a/vllm/v1/metrics/stats.py +++ b/vllm/v1/metrics/stats.py @@ -106,7 +106,6 @@ def update_from_output(self, output: "EngineCoreOutput", self.num_generation_tokens += num_new_generation_tokens if is_prefilling: - assert num_new_generation_tokens > 0 self.num_prompt_tokens += prompt_len first_token_latency = self._time_since(req_stats.arrival_time) diff --git a/vllm/v1/outputs.py b/vllm/v1/outputs.py index 8df3bc575516..6a55dda2ee06 100644 --- a/vllm/v1/outputs.py +++ b/vllm/v1/outputs.py @@ -101,7 +101,7 @@ class ModelRunnerOutput: # [prompt_len] prompt_logprobs_dict: dict[str, Optional[LogprobsTensors]] - pooler_output: list[torch.Tensor] + pooler_output: list[Optional[torch.Tensor]] # [req_ids] finished_sending: Optional[set[str]] = None diff --git a/vllm/v1/request.py b/vllm/v1/request.py index 74cca1d2f394..c6b666d7e968 100644 --- a/vllm/v1/request.py +++ b/vllm/v1/request.py @@ -119,7 +119,8 @@ def from_engine_core_request(cls, request: EngineCoreRequest) -> "Request": eos_token_id=request.eos_token_id, lora_request=request.lora_request, structured_output_request=StructuredOutputRequest( - sampling_params=request.sampling_params), + sampling_params=request.sampling_params) \ + if request.sampling_params else None, cache_salt=request.cache_salt, ) diff --git a/vllm/v1/sample/metadata.py b/vllm/v1/sample/metadata.py index a8271e2df5f9..ab13b288a5a9 100644 --- a/vllm/v1/sample/metadata.py +++ b/vllm/v1/sample/metadata.py @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from dataclasses import dataclass -from 
typing import Any, Optional +from typing import Optional import torch @@ -42,8 +42,3 @@ class SamplingMetadata: # req_index -> bad_words_token_ids bad_words_token_ids: dict[int, list[list[int]]] - - -@dataclass -class PoolingMetadata: - additional_data: Optional[Any] = None diff --git a/vllm/v1/structured_output/__init__.py b/vllm/v1/structured_output/__init__.py index 2d9c97e22de8..c5500b9a384d 100644 --- a/vllm/v1/structured_output/__init__.py +++ b/vllm/v1/structured_output/__init__.py @@ -62,8 +62,8 @@ def grammar_init(self, request: Request) -> None: return if TYPE_CHECKING: - assert request.sampling_params is not None - assert request.sampling_params.guided_decoding is not None + assert request.sampling_params is not None and \ + request.sampling_params.guided_decoding is not None # Initialize the backend the first time it is needed. # diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index aa7d0fbeb9ef..01394b4992de 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -561,7 +561,8 @@ def _get_cumsum_and_arange( def _prepare_inputs( self, scheduler_output: "SchedulerOutput", - ) -> tuple[dict[str, Any], torch.Tensor, Optional[SpecDecodeMetadata]]: + ) -> tuple[dict[str, Any], torch.Tensor, Optional[SpecDecodeMetadata], + np.ndarray]: total_num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens assert total_num_scheduled_tokens > 0 num_reqs = self.input_batch.num_reqs @@ -736,7 +737,8 @@ def _prepare_inputs( if self.lora_config: self.set_active_loras(self.input_batch, num_scheduled_tokens) - return attn_metadata, logits_indices, spec_decode_metadata + return attn_metadata, logits_indices, \ + spec_decode_metadata, num_scheduled_tokens def _compute_cascade_attn_prefix_len( self, @@ -1195,7 +1197,8 @@ def execute_model( return self.kv_connector_no_forward(scheduler_output) # Prepare the decoder inputs. 
- attn_metadata, logits_indices, spec_decode_metadata = ( + attn_metadata, logits_indices, \ + spec_decode_metadata, num_scheduled_tokens_np = ( self._prepare_inputs(scheduler_output)) num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens if (self.use_cuda_graph @@ -1310,30 +1313,46 @@ def execute_model( all_gather_group=get_tp_group()) logits = None else: - if self.input_batch.pooling_params: assert self.input_batch.num_reqs ==\ len(self.input_batch.pooling_params), \ "Either all or none of the requests in" \ " a batch must be pooling request" - pooler_output = self.model.pooler( - hidden_states=hidden_states, - pooling_metadata=self.input_batch.pooling_metadata) + offset = 0 + extracted_hidden_states = list[torch.Tensor]() + for seq_len in num_scheduled_tokens_np: + extracted_hidden_states.append( + hidden_states[offset:offset + seq_len]) + offset += seq_len + + pooling_metadata = self.input_batch.pooling_metadata - # any token will do because max tokens is 1 - sampled_tokens = [[0]] * self.input_batch.num_reqs + raw_pooler_output = self.model.pooler( + hidden_states=extracted_hidden_states, + pooling_metadata=pooling_metadata) + + pooler_output: list[Optional[torch.Tensor]] = [] + seq_lens = self.seq_lens[:self.input_batch.num_reqs] + for raw_output, seq_len, prompt_len in zip( + raw_pooler_output, seq_lens, + pooling_metadata.prompt_lens): + + if seq_len == prompt_len: + pooler_output.append(raw_output.data.to("cpu")) + else: + pooler_output.append(None) return ModelRunnerOutput( req_ids=self.input_batch.req_ids, req_id_to_index=self.input_batch.req_id_to_index, - sampled_token_ids=sampled_tokens, + sampled_token_ids=[], spec_token_ids=None, logprobs=None, prompt_logprobs_dict={}, - pooler_output=[ - o.data.to("cpu") for o in pooler_output.outputs - ], + pooler_output=pooler_output, + finished_sending=finished_sending, + finished_recving=finished_recving, ) sample_hidden_states = hidden_states[logits_indices] @@ -1784,7 +1803,7 @@ def _dummy_run( self, num_tokens: int, skip_attn: bool = True, - ) -> torch.Tensor: + ) -> tuple[torch.Tensor, torch.Tensor, int, np.ndarray]: # Padding for DP num_pad, num_tokens_across_dp = self.get_dp_padding(num_tokens) @@ -1804,6 +1823,10 @@ def _dummy_run( num_scheduled_tokens = np.array(num_scheduled_tokens_list, dtype=np.int32) + self.seq_lens.fill_(0) + seq_lens = self.seq_lens[:num_reqs] + seq_lens.copy_(torch.from_numpy(num_scheduled_tokens)) + if skip_attn: attn_metadata: Optional[dict[str, Any]] = None else: @@ -1859,17 +1882,19 @@ def _dummy_run( intermediate_tensors = self.sync_and_slice_intermediate_tensors( num_tokens, None, False) - with set_forward_context( - attn_metadata, - self.vllm_config, - num_tokens=num_tokens, - num_tokens_across_dp=num_tokens_across_dp): + with set_forward_context(attn_metadata, + self.vllm_config, + num_tokens=num_tokens, + num_tokens_across_dp=num_tokens_across_dp, + seq_lens=seq_lens): outputs = model( input_ids=input_ids, positions=positions, intermediate_tensors=intermediate_tensors, inputs_embeds=inputs_embeds, ) + + positions = self.positions[:num_tokens].zero_() if self.use_aux_hidden_state_outputs: hidden_states, _ = outputs else: @@ -1880,7 +1905,8 @@ def _dummy_run( self.drafter.dummy_run(num_tokens) logit_indices = np.cumsum(num_scheduled_tokens) - 1 - return hidden_states, hidden_states[logit_indices], num_reqs + return hidden_states, hidden_states[ + logit_indices], num_reqs, num_scheduled_tokens @torch.inference_mode() def _dummy_sampler_run( @@ -1965,12 +1991,21 @@ def 
_dummy_pooler_run( num_tokens: int, num_reqs: int, hidden_states: torch.Tensor, + num_scheduled_tokens: np.ndarray, ) -> torch.Tensor: + num_reqs = num_scheduled_tokens.shape[0] + + offset = 0 + hidden_states_list = list[torch.Tensor]() + for seq_len in num_scheduled_tokens: + hidden_states_list.append(hidden_states[offset:offset + seq_len]) + offset += seq_len + req_num_tokens = num_tokens // num_reqs dummy_metadata = PoolingMetadata( - prompt_lens=torch.tensor([req_num_tokens] * num_reqs, + prompt_lens=torch.tensor([h.shape[0] for h in hidden_states_list], device=self.device), prompt_token_ids=torch.zeros((num_reqs, req_num_tokens), dtype=torch.int32, @@ -1978,7 +2013,7 @@ def _dummy_pooler_run( pooling_params=[PoolingParams()] * num_reqs) try: - pooler_output = self.model.pooler(hidden_states=hidden_states, + pooler_output = self.model.pooler(hidden_states=hidden_states_list, pooling_metadata=dummy_metadata) except RuntimeError as e: if 'out of memory' in str(e): @@ -2060,12 +2095,13 @@ def profile_run(self) -> None: # Cache the dummy encoder outputs. self.encoder_cache["tmp"] = dict(enumerate(dummy_encoder_outputs)) - hidden_states, last_hidden_states, num_reqs = self._dummy_run( - self.max_num_tokens) + hidden_states, last_hidden_states, num_reqs, num_scheduled_tokens \ + = self._dummy_run(self.max_num_tokens) if get_pp_group().is_last_rank: if self.is_pooling_model: output = self._dummy_pooler_run(self.max_num_tokens, num_reqs, - hidden_states) + hidden_states, + num_scheduled_tokens) else: output = self._dummy_sampler_run(last_hidden_states) else: @@ -2303,14 +2339,16 @@ def get_kv_cache_spec(self) -> dict[str, KVCacheSpec]: head_size=attn_module.head_size, dtype=self.kv_cache_dtype, sliding_window=attn_module.sliding_window, - use_mla=use_mla) + use_mla=use_mla, + attn_type=str(attn_module.attn_type)) else: kv_cache_spec[layer_name] = FullAttentionSpec( block_size=block_size, num_kv_heads=attn_module.num_kv_heads, head_size=attn_module.head_size, dtype=self.kv_cache_dtype, - use_mla=use_mla) + use_mla=use_mla, + attn_type=str(attn_module.attn_type)) elif attn_module.attn_type == AttentionType.ENCODER: # encoder attention does not need KV cache. 
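_prepare_inputs now also returns the per-request scheduled token counts so that execute_model and _dummy_pooler_run can hand the pooler a list of per-request hidden states rather than one flat tensor. The slicing loops in those two places are equivalent to a single torch.split over the flat hidden states, for example:

    import numpy as np
    import torch

    hidden_size = 16
    num_scheduled_tokens = np.array([5, 3, 7], dtype=np.int32)
    hidden_states = torch.randn(int(num_scheduled_tokens.sum()), hidden_size)

    # Loop-based slicing, as in execute_model / _dummy_pooler_run ...
    chunks = []
    offset = 0
    for seq_len in num_scheduled_tokens:
        chunks.append(hidden_states[offset:offset + seq_len])
        offset += seq_len

    # ... which matches a single torch.split call.
    split = torch.split(hidden_states, num_scheduled_tokens.tolist())
    assert all(torch.equal(a, b) for a, b in zip(chunks, split))
    print([c.shape[0] for c in chunks])  # [5, 3, 7]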
continue diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index 9339cdeac61c..e5066cc56e90 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -487,6 +487,7 @@ def get_kv_cache_spec(self) -> dict[str, KVCacheSpec]: dtype=attn_module.dtype, sliding_window=attn_module.sliding_window, use_mla=False, + attn_type=str(attn_module.attn_type), ) else: kv_cache_spec[layer_name] = FullAttentionSpec( @@ -495,6 +496,7 @@ def get_kv_cache_spec(self) -> dict[str, KVCacheSpec]: head_size=attn_module.head_size, dtype=attn_module.dtype, use_mla=False, + attn_type=str(attn_module.attn_type), ) elif attn_module.attn_type in (AttentionType.ENCODER, AttentionType.ENCODER_ONLY): diff --git a/vllm/worker/cpu_pooling_model_runner.py b/vllm/worker/cpu_pooling_model_runner.py index 203fdf225a41..d3c4ff860c69 100644 --- a/vllm/worker/cpu_pooling_model_runner.py +++ b/vllm/worker/cpu_pooling_model_runner.py @@ -60,8 +60,13 @@ def execute_model( intermediate_tensors, } - with set_forward_context(model_input.attn_metadata, self.vllm_config, - model_input.virtual_engine): + assert model_input.seq_lens is not None + seq_lens_tensor = torch.tensor(model_input.seq_lens, dtype=torch.int32) + + with set_forward_context(model_input.attn_metadata, + self.vllm_config, + model_input.virtual_engine, + seq_lens=seq_lens_tensor): hidden_states = model_executable(**execute_model_kwargs) # Only perform pooling in the driver worker. diff --git a/vllm/worker/pooling_model_runner.py b/vllm/worker/pooling_model_runner.py index f80955f71a5a..dc8d70006969 100644 --- a/vllm/worker/pooling_model_runner.py +++ b/vllm/worker/pooling_model_runner.py @@ -114,8 +114,13 @@ def execute_model( if model_input.token_types is not None: cross_enc_kwargs["token_type_ids"] = model_input.token_types - with set_forward_context(model_input.attn_metadata, self.vllm_config, - virtual_engine): + assert model_input.seq_lens is not None + seq_lens_tensor = torch.tensor(model_input.seq_lens, dtype=torch.int32) + + with set_forward_context(model_input.attn_metadata, + self.vllm_config, + virtual_engine, + seq_lens=seq_lens_tensor): hidden_or_intermediate_states = model_executable( input_ids=model_input.input_tokens, positions=model_input.input_positions, From e81470c12370de50ae95bafb752b8537a393038f Mon Sep 17 00:00:00 2001 From: Max de Bayser Date: Wed, 4 Jun 2025 17:42:24 -0300 Subject: [PATCH 22/63] fix merge errors Signed-off-by: Max de Bayser --- vllm/model_executor/models/bert.py | 12 +++--------- vllm/v1/worker/gpu_model_runner.py | 13 +++++++++++-- vllm/v1/worker/gpu_worker.py | 6 ++++-- 3 files changed, 18 insertions(+), 13 deletions(-) diff --git a/vllm/model_executor/models/bert.py b/vllm/model_executor/models/bert.py index 62ab69eae122..5b98cbaafe79 100644 --- a/vllm/model_executor/models/bert.py +++ b/vllm/model_executor/models/bert.py @@ -12,7 +12,6 @@ from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, PoolerConfig, VllmConfig from vllm.distributed import get_tensor_model_parallel_world_size -from vllm.forward_context import get_forward_context from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.linear import (ColumnParallelLinear, QKVParallelLinear, @@ -57,7 +56,6 @@ def __init__(self, config: BertConfig): def forward( self, input_ids: torch.Tensor, - seq_lens: torch.Tensor, position_ids: torch.Tensor, token_type_ids: Optional[torch.Tensor] = None, ) -> torch.Tensor: @@ -342,13 +340,9 @@ 
def forward( if inputs_embeds is not None: hidden_states = inputs_embeds else: - attn_metadata = get_forward_context().attn_metadata - assert hasattr(attn_metadata, "seq_lens_tensor") - hidden_states = self.embeddings( - input_ids=input_ids, - seq_lens=attn_metadata.seq_lens_tensor, - position_ids=position_ids, - token_type_ids=token_type_ids) + hidden_states = self.embeddings(input_ids=input_ids, + position_ids=position_ids, + token_type_ids=token_type_ids) return self.encoder(hidden_states) def load_weights(self, weights: Iterable[tuple[str, diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 01394b4992de..06e6eb344814 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1273,12 +1273,15 @@ def execute_model( intermediate_tensors = self.sync_and_slice_intermediate_tensors( num_input_tokens, intermediate_tensors, True) - # Run the decoder. + seq_lens = self.seq_lens[:self.input_batch.num_reqs] + + # Run the model # Use persistent buffers for CUDA graphs. with set_forward_context(attn_metadata, self.vllm_config, num_tokens=num_input_tokens, - num_tokens_across_dp=num_tokens_across_dp): + num_tokens_across_dp=num_tokens_across_dp, + seq_lens=seq_lens): self.maybe_setup_kv_connector(scheduler_output) model_output = self.model( @@ -1869,6 +1872,12 @@ def _dummy_run( else: positions = self.positions[:num_tokens] + offset = 0 + for seq_len in num_scheduled_tokens_list: + positions[offset:offset + seq_len] = torch.arange( + seq_len, dtype=positions.dtype) + offset += seq_len + if get_pp_group().is_first_rank: intermediate_tensors = None else: diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index 20894db41356..1523d7175508 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -253,11 +253,13 @@ def compile_or_warm_up_model(self) -> None: max_num_reqs = min(self.scheduler_config.max_num_seqs, self.scheduler_config.max_num_batched_tokens) - hidden_states, last_hidden_states, num_reqs = \ + hidden_states, last_hidden_states, \ + num_reqs, num_scheduled_tokens = \ self.model_runner._dummy_run(num_tokens=max_num_reqs) if self.model_runner.is_pooling_model: self.model_runner._dummy_pooler_run(max_num_reqs, num_reqs, - hidden_states) + hidden_states, + num_scheduled_tokens) else: self.model_runner._dummy_sampler_run( hidden_states=last_hidden_states) From 20e7140cfe6d1c68245ecd47b6b0d3f211f82eb3 Mon Sep 17 00:00:00 2001 From: Max de Bayser Date: Wed, 4 Jun 2025 18:35:43 -0300 Subject: [PATCH 23/63] address review comments Signed-off-by: Max de Bayser --- vllm/v1/engine/llm_engine.py | 6 ++++-- vllm/v1/engine/output_processor.py | 3 ++- vllm/v1/engine/processor.py | 34 +++++++++++++++--------------- 3 files changed, 23 insertions(+), 20 deletions(-) diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index 19e682245050..1932cd10bb1b 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -198,7 +198,9 @@ def add_request( tokenization_kwargs, trace_headers, prompt_adapter_request, priority) - if not isinstance(params, SamplingParams) or (n := params.n) == 1: + n = params.n if isinstance(params, SamplingParams) else 1 + + if n == 1: # Make a new RequestState and queue. self.output_processor.add_request(request, prompt_str, None, 0) # Add the request to EngineCore. @@ -219,7 +221,7 @@ def add_request( # Add the request to EngineCore. 
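With token_type_ids threaded through BertEmbeddingModel.forward (and BertEmbedding no longer reaching into attention metadata for seq_lens), the embedding layer simply sums word, position and token-type embeddings over the flat token sequence. A minimal illustrative sketch of that combination, with made-up sizes and without the quantization and tensor-parallel details of the real module:

    import torch
    import torch.nn as nn


    class TinyBertEmbeddings(nn.Module):
        def __init__(self, vocab_size=100, hidden=8, max_pos=32, n_types=2):
            super().__init__()
            self.word_embeddings = nn.Embedding(vocab_size, hidden)
            self.position_embeddings = nn.Embedding(max_pos, hidden)
            self.token_type_embeddings = nn.Embedding(n_types, hidden)
            self.layer_norm = nn.LayerNorm(hidden)

        def forward(self, input_ids, position_ids, token_type_ids=None):
            if token_type_ids is None:
                token_type_ids = torch.zeros_like(input_ids)
            emb = (self.word_embeddings(input_ids) +
                   self.position_embeddings(position_ids) +
                   self.token_type_embeddings(token_type_ids))
            return self.layer_norm(emb)


    emb = TinyBertEmbeddings()
    ids = torch.tensor([2, 45, 7, 3])          # flat [num_tokens] layout
    pos = torch.arange(4)
    types = torch.tensor([0, 0, 1, 1])         # e.g. a cross-encoder pair
    print(emb(ids, pos, types).shape)          # torch.Size([4, 8])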
self.engine_core.add_request(child_request) - def step(self) -> list[Union[RequestOutput, PoolingRequestOutput]]: + def step(self) -> Union[list[RequestOutput], list[PoolingRequestOutput]]: if self.should_execute_dummy_batch: self.should_execute_dummy_batch = False diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py index a10ef2ddbe2d..2bcd61d1f0aa 100644 --- a/vllm/v1/engine/output_processor.py +++ b/vllm/v1/engine/output_processor.py @@ -368,7 +368,8 @@ def process_outputs( within the loop below. """ - request_outputs: list[Union[RequestOutput, PoolingRequestOutput]] = [] + request_outputs: Union[list[RequestOutput], + list[PoolingRequestOutput]] = [] reqs_to_abort: list[str] = [] for engine_core_output in engine_core_outputs: req_id = engine_core_output.request_id diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index c21347f19d4f..81f0bc936bac 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -257,6 +257,23 @@ def process_inputs( if encoder_inputs is not None: raise NotImplementedError + sampling_params = None + pooling_params = None + if isinstance(params, SamplingParams): + # TODO: can we avoid cloning here in multiproc case? + sampling_params = params.clone() + # If unset max tokens, then generate up to the max_model_len. + if sampling_params.max_tokens is None: + sampling_params.max_tokens = ( + self.model_config.max_model_len - + len(decoder_inputs["prompt_token_ids"])) + sampling_params.update_from_generation_config( + self.generation_config_fields, eos_token_id) + sampling_params.update_from_tokenizer( + self.tokenizer.get_lora_tokenizer(lora_request)) + else: + pooling_params = params.clone() + # Multimodal related. sorted_mm_inputs: Optional[Sequence[Optional[MultiModalKwargs]]] = None sorted_mm_positions: Optional[list[PlaceholderRange]] = None @@ -305,23 +322,6 @@ def process_inputs( else: sorted_mm_inputs = orig_sorted_mm_inputs - sampling_params = None - pooling_params = None - if isinstance(params, SamplingParams): - # TODO: can we avoid cloning here in multiproc case? - sampling_params = params.clone() - # If unset max tokens, then generate up to the max_model_len. 
- if sampling_params.max_tokens is None: - sampling_params.max_tokens = ( - self.model_config.max_model_len - - len(decoder_inputs["prompt_token_ids"])) - sampling_params.update_from_generation_config( - self.generation_config_fields, eos_token_id) - sampling_params.update_from_tokenizer( - self.tokenizer.get_lora_tokenizer(lora_request)) - else: - pooling_params = params.clone() - return decoder_inputs.get("prompt"), EngineCoreRequest( request_id=request_id, prompt_token_ids=decoder_inputs["prompt_token_ids"], From 6bc1e3dbeb527f057957e78d24ecdb9f22f04c93 Mon Sep 17 00:00:00 2001 From: Max de Bayser Date: Wed, 4 Jun 2025 19:04:42 -0300 Subject: [PATCH 24/63] address review comments Signed-off-by: Max de Bayser --- vllm/v1/outputs.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/v1/outputs.py b/vllm/v1/outputs.py index 6a55dda2ee06..2234843293cc 100644 --- a/vllm/v1/outputs.py +++ b/vllm/v1/outputs.py @@ -101,6 +101,7 @@ class ModelRunnerOutput: # [prompt_len] prompt_logprobs_dict: dict[str, Optional[LogprobsTensors]] + # [num_reqs, hidden_size] pooler_output: list[Optional[torch.Tensor]] # [req_ids] From 22825bd4da98c6cddfc19967db1706c42e69d010 Mon Sep 17 00:00:00 2001 From: Max de Bayser Date: Wed, 4 Jun 2025 19:44:50 -0300 Subject: [PATCH 25/63] simplify PR Signed-off-by: Max de Bayser --- vllm/forward_context.py | 7 +-- vllm/model_executor/models/roberta.py | 72 ++++++++++++++----------- vllm/v1/worker/gpu_model_runner.py | 17 +++--- vllm/worker/cpu_pooling_model_runner.py | 9 +--- vllm/worker/pooling_model_runner.py | 9 +--- 5 files changed, 55 insertions(+), 59 deletions(-) diff --git a/vllm/forward_context.py b/vllm/forward_context.py index 21ff4196f041..f3b0518a44e0 100644 --- a/vllm/forward_context.py +++ b/vllm/forward_context.py @@ -94,7 +94,6 @@ class ForwardContext: virtual_engine: int # set dynamically for each forward pass # set dynamically for each forward pass dp_metadata: Optional[DPMetadata] = None - seq_lens: Optional[torch.Tensor] = None _forward_context: Optional[ForwardContext] = None @@ -113,8 +112,7 @@ def set_forward_context(attn_metadata: Any, vllm_config: VllmConfig, virtual_engine: int = 0, num_tokens: Optional[int] = None, - num_tokens_across_dp: Optional[torch.Tensor] = None, - seq_lens: Optional[torch.Tensor] = None): + num_tokens_across_dp: Optional[torch.Tensor] = None): """A context manager that stores the current forward context, can be attention metadata, etc. Here we can inject common logic for every model forward pass. 
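# A standalone sketch of the lookup pattern that the roberta.py change below
# relies on once seq_lens is no longer stored on ForwardContext: the embedding
# layer pulls per-request sequence lengths out of whatever attention metadata
# the forward context carries. The metadata classes and the helper name
# extract_seq_lens are illustrative stand-ins, not vLLM types.
from dataclasses import dataclass
from typing import Optional, Union

import torch


@dataclass
class _V1MetadataStub:  # stand-in for a V1 per-layer metadata entry
    seq_lens: torch.Tensor


@dataclass
class _V0MetadataStub:  # stand-in for V0-style attention metadata
    seq_lens_tensor: torch.Tensor


def extract_seq_lens(
    attn_metadata: Optional[Union[dict, _V1MetadataStub, _V0MetadataStub]]
) -> Optional[torch.Tensor]:
    if attn_metadata is None:  # e.g. during profiling / dummy runs
        return None
    if isinstance(attn_metadata, dict):
        # V1 keys the metadata per attention layer; every entry carries the
        # same per-request sequence lengths, so any one of them will do.
        attn_metadata = next(iter(attn_metadata.values()))
    if hasattr(attn_metadata, "seq_lens_tensor"):
        return attn_metadata.seq_lens_tensor  # V0 naming
    return attn_metadata.seq_lens  # V1 naming


# Three requests of 4, 2 and 7 tokens, in both metadata shapes:
print(extract_seq_lens(_V0MetadataStub(torch.tensor([4, 2, 7]))))
print(extract_seq_lens({"layers.0.attn": _V1MetadataStub(torch.tensor([4, 2, 7]))}))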
@@ -137,8 +135,7 @@ def set_forward_context(attn_metadata: Any, static_forward_context, virtual_engine=virtual_engine, attn_metadata=attn_metadata, - dp_metadata=dp_metadata, - seq_lens=seq_lens) + dp_metadata=dp_metadata) try: yield diff --git a/vllm/model_executor/models/roberta.py b/vllm/model_executor/models/roberta.py index ac9dd4dcca77..6be1a6ed5701 100644 --- a/vllm/model_executor/models/roberta.py +++ b/vllm/model_executor/models/roberta.py @@ -3,7 +3,7 @@ import itertools from collections.abc import Iterable -from typing import Optional, Union +from typing import Any, Optional, Union import torch from torch import nn @@ -59,35 +59,47 @@ def forward( input_shape = input_ids.size() inputs_embeds = self.word_embeddings(input_ids) - seq_lens = get_forward_context().seq_lens - - # Replace position ids because in RoBERTa models - # they have to start at padding_idx + 1 and ignore - # existing padding tokens - # References: - # - https://github.com/huggingface/transformers/blob/a3d69a8994d673899608a7c17fbf4f953f50474e/src/transformers/models/roberta/modeling_roberta.py#L133 - # - https://github.com/huggingface/transformers/blob/a3d69a8994d673899608a7c17fbf4f953f50474e/src/transformers/models/roberta/modeling_roberta.py#L1669 - pos_list = [] - token_list = [] - offset = 0 - for seq_len in seq_lens: - pos_list.append(position_ids[offset:offset + seq_len]) - token_list.append(input_ids[offset:offset + seq_len]) - offset += seq_len - - offset = 0 - for positions, tokens in zip(pos_list, token_list): - # Verify assumption that incoming position are - # always a sequence from 0 to N. - expected_pos = torch.arange(positions.size()[0], - dtype=torch.long, - device=inputs_embeds.device) - assert torch.equal(positions, expected_pos) - new_pos = create_position_ids_from_input_ids( - tokens, self.padding_idx) - seq_len = new_pos.shape[0] - position_ids[offset:offset + seq_len] = new_pos - offset += seq_len + attn_metadata = get_forward_context().attn_metadata + + seq_lens: Optional[torch.Tensor] = None + attn_metadata: Any = get_forward_context().attn_metadata + if attn_metadata is not None: + if isinstance(attn_metadata, dict): + attn_metadata = next(iter(attn_metadata.values())) + assert hasattr(attn_metadata, "seq_lens") + seq_lens = attn_metadata.seq_lens + else: + assert hasattr(attn_metadata, "seq_lens_tensor") + seq_lens = attn_metadata.seq_lens_tensor + + if seq_lens: + # Replace position ids because in RoBERTa models + # they have to start at padding_idx + 1 and ignore + # existing padding tokens + # References: + # - https://github.com/huggingface/transformers/blob/a3d69a8994d673899608a7c17fbf4f953f50474e/src/transformers/models/roberta/modeling_roberta.py#L133 + # - https://github.com/huggingface/transformers/blob/a3d69a8994d673899608a7c17fbf4f953f50474e/src/transformers/models/roberta/modeling_roberta.py#L1669 + pos_list = [] + token_list = [] + offset = 0 + for seq_len in seq_lens: + pos_list.append(position_ids[offset:offset + seq_len]) + token_list.append(input_ids[offset:offset + seq_len]) + offset += seq_len + + offset = 0 + for positions, tokens in zip(pos_list, token_list): + # Verify assumption that incoming position are + # always a sequence from 0 to N. 
+ expected_pos = torch.arange(positions.size()[0], + dtype=torch.long, + device=inputs_embeds.device) + assert torch.equal(positions, expected_pos) + new_pos = create_position_ids_from_input_ids( + tokens, self.padding_idx) + seq_len = new_pos.shape[0] + position_ids[offset:offset + seq_len] = new_pos + offset += seq_len # Position embeddings. position_embeddings = self.position_embeddings(position_ids) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 06e6eb344814..c6c8ee3c4433 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1273,15 +1273,12 @@ def execute_model( intermediate_tensors = self.sync_and_slice_intermediate_tensors( num_input_tokens, intermediate_tensors, True) - seq_lens = self.seq_lens[:self.input_batch.num_reqs] - # Run the model # Use persistent buffers for CUDA graphs. with set_forward_context(attn_metadata, self.vllm_config, num_tokens=num_input_tokens, - num_tokens_across_dp=num_tokens_across_dp, - seq_lens=seq_lens): + num_tokens_across_dp=num_tokens_across_dp): self.maybe_setup_kv_connector(scheduler_output) model_output = self.model( @@ -1891,11 +1888,11 @@ def _dummy_run( intermediate_tensors = self.sync_and_slice_intermediate_tensors( num_tokens, None, False) - with set_forward_context(attn_metadata, - self.vllm_config, - num_tokens=num_tokens, - num_tokens_across_dp=num_tokens_across_dp, - seq_lens=seq_lens): + with set_forward_context( + attn_metadata, + self.vllm_config, + num_tokens=num_tokens, + num_tokens_across_dp=num_tokens_across_dp): outputs = model( input_ids=input_ids, positions=positions, @@ -2134,7 +2131,7 @@ def capture_model(self) -> None: # Capture the large shapes first so that the smaller shapes # can reuse the memory pool allocated for the large shapes. with graph_capture(device=self.device): - skip_attn = not self.vllm_config.compilation_config.full_cuda_graph + skip_attn = False for num_tokens in reversed(self.cudagraph_batch_sizes): for _ in range(self.vllm_config.compilation_config. cudagraph_num_of_warmups): diff --git a/vllm/worker/cpu_pooling_model_runner.py b/vllm/worker/cpu_pooling_model_runner.py index d3c4ff860c69..203fdf225a41 100644 --- a/vllm/worker/cpu_pooling_model_runner.py +++ b/vllm/worker/cpu_pooling_model_runner.py @@ -60,13 +60,8 @@ def execute_model( intermediate_tensors, } - assert model_input.seq_lens is not None - seq_lens_tensor = torch.tensor(model_input.seq_lens, dtype=torch.int32) - - with set_forward_context(model_input.attn_metadata, - self.vllm_config, - model_input.virtual_engine, - seq_lens=seq_lens_tensor): + with set_forward_context(model_input.attn_metadata, self.vllm_config, + model_input.virtual_engine): hidden_states = model_executable(**execute_model_kwargs) # Only perform pooling in the driver worker. 
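# A small standalone illustration of why the roberta.py hunks in this patch
# rebuild position_ids: RoBERTa positions have to start at padding_idx + 1 and
# skip padding tokens. The helper below follows the Hugging Face reference
# linked in the comments, adapted to a flattened 1-D token layout; it is a
# sketch for clarity, not the vLLM implementation.
import torch


def create_position_ids_from_input_ids(input_ids: torch.Tensor,
                                       padding_idx: int) -> torch.Tensor:
    # Non-pad tokens get incremental indices starting right after padding_idx;
    # pad tokens keep padding_idx itself.
    mask = input_ids.ne(padding_idx).int()
    incremental_indices = torch.cumsum(mask, dim=0) * mask
    return incremental_indices.long() + padding_idx


# padding_idx is typically 1 for RoBERTa; two pads followed by three tokens:
tokens = torch.tensor([1, 1, 50, 51, 52])
print(create_position_ids_from_input_ids(tokens, padding_idx=1))
# -> tensor([1, 1, 2, 3, 4]): real positions start at padding_idx + 1 = 2.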
diff --git a/vllm/worker/pooling_model_runner.py b/vllm/worker/pooling_model_runner.py index dc8d70006969..f80955f71a5a 100644 --- a/vllm/worker/pooling_model_runner.py +++ b/vllm/worker/pooling_model_runner.py @@ -114,13 +114,8 @@ def execute_model( if model_input.token_types is not None: cross_enc_kwargs["token_type_ids"] = model_input.token_types - assert model_input.seq_lens is not None - seq_lens_tensor = torch.tensor(model_input.seq_lens, dtype=torch.int32) - - with set_forward_context(model_input.attn_metadata, - self.vllm_config, - virtual_engine, - seq_lens=seq_lens_tensor): + with set_forward_context(model_input.attn_metadata, self.vllm_config, + virtual_engine): hidden_or_intermediate_states = model_executable( input_ids=model_input.input_tokens, positions=model_input.input_positions, From c889b2e0a8a5339daa9e8961b5c080909edeed4e Mon Sep 17 00:00:00 2001 From: Max de Bayser Date: Wed, 4 Jun 2025 19:57:30 -0300 Subject: [PATCH 26/63] fix mistake Signed-off-by: Max de Bayser --- vllm/model_executor/models/roberta.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/models/roberta.py b/vllm/model_executor/models/roberta.py index 6be1a6ed5701..3c250819c8e9 100644 --- a/vllm/model_executor/models/roberta.py +++ b/vllm/model_executor/models/roberta.py @@ -72,7 +72,7 @@ def forward( assert hasattr(attn_metadata, "seq_lens_tensor") seq_lens = attn_metadata.seq_lens_tensor - if seq_lens: + if seq_lens is not None: # Replace position ids because in RoBERTa models # they have to start at padding_idx + 1 and ignore # existing padding tokens From 24462e42b72898ec67b26af464c93a46352ae663 Mon Sep 17 00:00:00 2001 From: Max de Bayser Date: Fri, 6 Jun 2025 18:50:07 -0300 Subject: [PATCH 27/63] workaround qwen model test issue Signed-off-by: Max de Bayser --- tests/conftest.py | 3 +++ tests/models/language/pooling/test_embedding.py | 10 ++++++++-- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 5ec3926bd31f..f4ebb2c6bd25 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -145,6 +145,7 @@ def run_with_both_engines(request, monkeypatch): # Automatically runs tests twice, once with V1 and once without use_v1 = request.param # Tests decorated with `@skip_v1` are only run without v1 + skip_v0 = request.node.get_closest_marker("skip_v0") skip_v1 = request.node.get_closest_marker("skip_v1") if use_v1: @@ -152,6 +153,8 @@ def run_with_both_engines(request, monkeypatch): pytest.skip("Skipping test on vllm V1") monkeypatch.setenv('VLLM_USE_V1', '1') else: + if skip_v0: + pytest.skip("Skipping test on vllm V0") monkeypatch.setenv('VLLM_USE_V1', '0') yield diff --git a/tests/models/language/pooling/test_embedding.py b/tests/models/language/pooling/test_embedding.py index 15836211dd7a..b1e009fd29cc 100644 --- a/tests/models/language/pooling/test_embedding.py +++ b/tests/models/language/pooling/test_embedding.py @@ -28,13 +28,19 @@ def v1(run_with_both_engines): marks=[pytest.mark.core_model]), pytest.param("intfloat/e5-mistral-7b-instruct", marks=[pytest.mark.core_model, pytest.mark.cpu_model]), - pytest.param("ssmits/Qwen2-7B-Instruct-embed-base"), + # the qwen models interfere with each other (see PR + # https://github.com/vllm-project/vllm/pull/18720). + # To avoid this problem, for now we skip v0 since it will be + # deprecated anyway. 
+ pytest.param("ssmits/Qwen2-7B-Instruct-embed-base", + marks=[pytest.mark.skip_v0]), # [Encoder-only] pytest.param("BAAI/bge-base-en-v1.5", marks=[pytest.mark.core_model, pytest.mark.cpu_model]), pytest.param("sentence-transformers/all-MiniLM-L12-v2"), pytest.param("intfloat/multilingual-e5-small"), - pytest.param("Alibaba-NLP/gte-Qwen2-1.5B-instruct"), + pytest.param("Alibaba-NLP/gte-Qwen2-1.5B-instruct", + marks=[pytest.mark.skip_v0]), # [Cross-Encoder] pytest.param("sentence-transformers/stsb-roberta-base-v2"), ], From 79d1b95af5c97fac156a0e986bd4a6934ce33fa5 Mon Sep 17 00:00:00 2001 From: Max de Bayser Date: Fri, 6 Jun 2025 19:32:18 -0300 Subject: [PATCH 28/63] revert unecessary change Signed-off-by: Max de Bayser --- vllm/v1/engine/core.py | 78 ++++++++++++++++++------------------------ 1 file changed, 34 insertions(+), 44 deletions(-) diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 49dd6fce40c8..191f892d01f5 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -142,51 +142,41 @@ def _initialize_kv_caches( assert len(kv_cache_specs) == len(available_gpu_memory) # Get the kv cache tensor size - if any(kv_cache_specs): - kv_cache_configs = [ - get_kv_cache_config(vllm_config, kv_cache_spec_one_worker, - available_gpu_memory_one_worker) - for kv_cache_spec_one_worker, available_gpu_memory_one_worker - in zip(kv_cache_specs, available_gpu_memory) - ] - - for kv_cache_spec_one_worker in kv_cache_specs: - for _, spec in kv_cache_spec_one_worker.items(): - if isinstance(spec, AttentionSpec) and \ - spec.attn_type != "decoder": - - logger.info("Found non-decoder layer. Disabling " - "prefix cache and chunked prefill") - self.vllm_config.cache_config.\ - enable_prefix_caching = False - self.vllm_config.scheduler_config.\ - enable_chunked_prefill = False - self.vllm_config.scheduler_config.\ - chunked_prefill_enabled = False - self.vllm_config.scheduler_config.\ - long_prefill_token_threshold = 0 - break - - # Since we use a shared centralized controller, we need the - # `kv_cache_config` to be consistent across all workers to make sure - # all the memory operators can be applied to all workers. - unify_kv_cache_configs(kv_cache_configs) - # All workers have the same kv_cache_config except layer names, - # so use an arbitrary one to get the number of blocks. - assert all([ - cfg.num_blocks == kv_cache_configs[0].num_blocks - for cfg in kv_cache_configs - ]) - num_gpu_blocks = kv_cache_configs[0].num_blocks - else: - kv_cache_configs = [ - KVCacheConfig(num_blocks=1, - kv_cache_tensors=[], - kv_cache_groups=[]) - for kv_cache_spec_one_worker in kv_cache_specs - ] + kv_cache_configs = [ + get_kv_cache_config(vllm_config, kv_cache_spec_one_worker, + available_gpu_memory_one_worker) + for kv_cache_spec_one_worker, available_gpu_memory_one_worker in + zip(kv_cache_specs, available_gpu_memory) + ] + + for kv_cache_spec_one_worker in kv_cache_specs: + for _, spec in kv_cache_spec_one_worker.items(): + if isinstance(spec, AttentionSpec) and \ + spec.attn_type != "decoder": + + logger.info("Found non-decoder layer. 
Disabling " + "prefix cache and chunked prefill") + self.vllm_config.cache_config.\ + enable_prefix_caching = False + self.vllm_config.scheduler_config.\ + enable_chunked_prefill = False + self.vllm_config.scheduler_config.\ + chunked_prefill_enabled = False + self.vllm_config.scheduler_config.\ + long_prefill_token_threshold = 0 + break - num_gpu_blocks = 1 + # Since we use a shared centralized controller, we need the + # `kv_cache_config` to be consistent across all workers to make sure + # all the memory operators can be applied to all workers. + unify_kv_cache_configs(kv_cache_configs) + # All workers have the same kv_cache_config except layer names, + # so use an arbitrary one to get the number of blocks. + assert all([ + cfg.num_blocks == kv_cache_configs[0].num_blocks + for cfg in kv_cache_configs + ]) + num_gpu_blocks = kv_cache_configs[0].num_blocks # All workers have the same kv_cache_config except layer names, so use # an arbitrary one to initialize the scheduler. From b3a049150067354d65f67fbab1ca9e155574a472 Mon Sep 17 00:00:00 2001 From: Max de Bayser Date: Fri, 6 Jun 2025 19:34:12 -0300 Subject: [PATCH 29/63] remove duplicated code Signed-off-by: Max de Bayser --- vllm/v1/engine/core.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 191f892d01f5..bfd47be3afc7 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -170,13 +170,6 @@ def _initialize_kv_caches( # `kv_cache_config` to be consistent across all workers to make sure # all the memory operators can be applied to all workers. unify_kv_cache_configs(kv_cache_configs) - # All workers have the same kv_cache_config except layer names, - # so use an arbitrary one to get the number of blocks. - assert all([ - cfg.num_blocks == kv_cache_configs[0].num_blocks - for cfg in kv_cache_configs - ]) - num_gpu_blocks = kv_cache_configs[0].num_blocks # All workers have the same kv_cache_config except layer names, so use # an arbitrary one to initialize the scheduler. 
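# A standalone sketch of the check that the core.py hunks above keep in place:
# if any KV-cache spec belongs to a non-decoder attention layer, prefix caching
# and chunked prefill are switched off. The *Stub classes and the helper name
# are simplified stand-ins for the vLLM config objects, not the real API.
from dataclasses import dataclass


@dataclass
class _AttentionSpecStub:
    attn_type: str  # "decoder", "encoder_only", ...


@dataclass
class _SchedulerConfigStub:
    enable_chunked_prefill: bool = True
    chunked_prefill_enabled: bool = True
    long_prefill_token_threshold: int = 2048


def disable_features_for_non_decoder(
        kv_cache_spec: dict[str, _AttentionSpecStub],
        scheduler_config: _SchedulerConfigStub,
        enable_prefix_caching: bool) -> bool:
    """Returns the (possibly updated) prefix-caching flag."""
    for spec in kv_cache_spec.values():
        if spec.attn_type != "decoder":
            # Mirrors the "Found non-decoder layer" branch above.
            scheduler_config.enable_chunked_prefill = False
            scheduler_config.chunked_prefill_enabled = False
            scheduler_config.long_prefill_token_threshold = 0
            return False
    return enable_prefix_caching


specs = {"model.encoder.layers.0.attn": _AttentionSpecStub("encoder_only")}
cfg = _SchedulerConfigStub()
print(disable_features_for_non_decoder(specs, cfg, enable_prefix_caching=True))
# -> False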
From 1a82e567bc83bafde84e11bce9668087a21599af Mon Sep 17 00:00:00 2001 From: Max de Bayser Date: Fri, 6 Jun 2025 21:21:56 -0300 Subject: [PATCH 30/63] remove encoder model support to simplify PR Signed-off-by: Max de Bayser --- .../models/language/pooling/test_embedding.py | 16 ++-- tests/models/language/pooling/test_jina.py | 8 -- tests/models/language/pooling/test_scoring.py | 9 --- vllm/model_executor/models/bert.py | 22 +++--- vllm/model_executor/models/roberta.py | 77 ++++++++----------- vllm/v1/attention/backends/flash_attn.py | 30 ++------ vllm/v1/core/kv_cache_utils.py | 1 - vllm/v1/core/sched/output.py | 2 - vllm/v1/engine/__init__.py | 1 - vllm/v1/engine/core.py | 19 +---- vllm/v1/engine/processor.py | 1 - vllm/v1/kv_cache_interface.py | 1 - vllm/v1/request.py | 3 - vllm/v1/worker/gpu_input_batch.py | 29 ------- vllm/v1/worker/gpu_model_runner.py | 46 +++-------- vllm/v1/worker/tpu_model_runner.py | 3 - 16 files changed, 70 insertions(+), 198 deletions(-) diff --git a/tests/models/language/pooling/test_embedding.py b/tests/models/language/pooling/test_embedding.py index b1e009fd29cc..77207fcbcca3 100644 --- a/tests/models/language/pooling/test_embedding.py +++ b/tests/models/language/pooling/test_embedding.py @@ -36,13 +36,19 @@ def v1(run_with_both_engines): marks=[pytest.mark.skip_v0]), # [Encoder-only] pytest.param("BAAI/bge-base-en-v1.5", - marks=[pytest.mark.core_model, pytest.mark.cpu_model]), - pytest.param("sentence-transformers/all-MiniLM-L12-v2"), - pytest.param("intfloat/multilingual-e5-small"), + marks=[ + pytest.mark.core_model, pytest.mark.cpu_model, + pytest.mark.skip_v1 + ]), + pytest.param("sentence-transformers/all-MiniLM-L12-v2", + marks=[pytest.mark.skip_v1]), + pytest.param("intfloat/multilingual-e5-small", + marks=[pytest.mark.skip_v1]), pytest.param("Alibaba-NLP/gte-Qwen2-1.5B-instruct", - marks=[pytest.mark.skip_v0]), + marks=[pytest.mark.skip_v1]), # [Cross-Encoder] - pytest.param("sentence-transformers/stsb-roberta-base-v2"), + pytest.param("sentence-transformers/stsb-roberta-base-v2", + marks=[pytest.mark.skip_v1]), ], ) def test_models( diff --git a/tests/models/language/pooling/test_jina.py b/tests/models/language/pooling/test_jina.py index 69b53149c927..33255021ad6a 100644 --- a/tests/models/language/pooling/test_jina.py +++ b/tests/models/language/pooling/test_jina.py @@ -36,14 +36,6 @@ ] -@pytest.fixture(autouse=True) -def v1(run_with_both_engines): - # Simple autouse wrapper to run both engines for each test - # This can be promoted up to conftest.py to run for every - # test in a package - pass - - @pytest.fixture(scope="module", params=SCORING_MODELS) def model_name(request): yield request.param diff --git a/tests/models/language/pooling/test_scoring.py b/tests/models/language/pooling/test_scoring.py index 1cf2cdc01320..c75ff1445616 100644 --- a/tests/models/language/pooling/test_scoring.py +++ b/tests/models/language/pooling/test_scoring.py @@ -23,15 +23,6 @@ "The capital of Germany is Berlin.", ] - -@pytest.fixture(autouse=True) -def v1(run_with_both_engines): - # Simple autouse wrapper to run both engines for each test - # This can be promoted up to conftest.py to run for every - # test in a package - pass - - DTYPE = "half" diff --git a/vllm/model_executor/models/bert.py b/vllm/model_executor/models/bert.py index 5b98cbaafe79..8d12f91d92d4 100644 --- a/vllm/model_executor/models/bert.py +++ b/vllm/model_executor/models/bert.py @@ -12,6 +12,7 @@ from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, 
PoolerConfig, VllmConfig from vllm.distributed import get_tensor_model_parallel_world_size +from vllm.forward_context import get_forward_context from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.linear import (ColumnParallelLinear, QKVParallelLinear, @@ -27,7 +28,7 @@ from vllm.transformers_utils.config import ( get_cross_encoder_activation_function) -from .interfaces import SupportsCrossEncoding, SupportsQuant +from .interfaces import SupportsCrossEncoding, SupportsQuant, SupportsV0Only from .utils import WeightsMapper, maybe_prefix @@ -56,6 +57,7 @@ def __init__(self, config: BertConfig): def forward( self, input_ids: torch.Tensor, + seq_lens: torch.Tensor, position_ids: torch.Tensor, token_type_ids: Optional[torch.Tensor] = None, ) -> torch.Tensor: @@ -340,9 +342,13 @@ def forward( if inputs_embeds is not None: hidden_states = inputs_embeds else: - hidden_states = self.embeddings(input_ids=input_ids, - position_ids=position_ids, - token_type_ids=token_type_ids) + attn_metadata = get_forward_context().attn_metadata + assert hasattr(attn_metadata, "seq_lens_tensor") + hidden_states = self.embeddings( + input_ids=input_ids, + seq_lens=attn_metadata.seq_lens_tensor, + position_ids=position_ids, + token_type_ids=token_type_ids) return self.encoder(hidden_states) def load_weights(self, weights: Iterable[tuple[str, @@ -382,7 +388,7 @@ def load_weights(self, weights: Iterable[tuple[str, return loaded_params -class BertEmbeddingModel(nn.Module, SupportsQuant): +class BertEmbeddingModel(nn.Module, SupportsV0Only, SupportsQuant): """A model that uses Bert to provide embedding functionalities. This class encapsulates the BertModel and provides an interface for @@ -405,13 +411,11 @@ def forward( self, input_ids: Optional[torch.Tensor], positions: torch.Tensor, - token_type_ids: Optional[torch.Tensor] = None, intermediate_tensors: Optional[IntermediateTensors] = None, inputs_embeds: Optional[torch.Tensor] = None, ) -> torch.Tensor: hidden_states = self.model(input_ids=input_ids, position_ids=positions, - token_type_ids=token_type_ids, inputs_embeds=inputs_embeds, intermediate_tensors=intermediate_tensors) @@ -447,8 +451,8 @@ def _build_pooler(self, pooler_config: PoolerConfig) -> Pooler: softmax=False) -class BertForSequenceClassification(nn.Module, SupportsCrossEncoding, - SupportsQuant): +class BertForSequenceClassification(nn.Module, SupportsV0Only, + SupportsCrossEncoding, SupportsQuant): """A model that uses Bert to provide embedding functionalities. 
This class encapsulates the BertModel and provides an interface for diff --git a/vllm/model_executor/models/roberta.py b/vllm/model_executor/models/roberta.py index 3c250819c8e9..8fa8b89798d0 100644 --- a/vllm/model_executor/models/roberta.py +++ b/vllm/model_executor/models/roberta.py @@ -3,14 +3,13 @@ import itertools from collections.abc import Iterable -from typing import Any, Optional, Union +from typing import Optional, Union import torch from torch import nn from transformers import RobertaConfig from vllm.config import VllmConfig -from vllm.forward_context import get_forward_context from vllm.model_executor.layers.pooler import ClassifierPooler from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding) @@ -23,7 +22,7 @@ get_cross_encoder_activation_function) from .bert_with_rope import BertWithRope, JinaRobertaModel -from .interfaces import SupportsCrossEncoding +from .interfaces import SupportsCrossEncoding, SupportsV0Only class RobertaEmbedding(nn.Module): @@ -53,53 +52,38 @@ def __init__(self, config: RobertaConfig): def forward( self, input_ids: torch.Tensor, + seq_lens: torch.Tensor, position_ids: torch.Tensor, token_type_ids: Optional[torch.Tensor] = None, ) -> torch.Tensor: - input_shape = input_ids.size() inputs_embeds = self.word_embeddings(input_ids) - attn_metadata = get_forward_context().attn_metadata - - seq_lens: Optional[torch.Tensor] = None - attn_metadata: Any = get_forward_context().attn_metadata - if attn_metadata is not None: - if isinstance(attn_metadata, dict): - attn_metadata = next(iter(attn_metadata.values())) - assert hasattr(attn_metadata, "seq_lens") - seq_lens = attn_metadata.seq_lens - else: - assert hasattr(attn_metadata, "seq_lens_tensor") - seq_lens = attn_metadata.seq_lens_tensor - - if seq_lens is not None: - # Replace position ids because in RoBERTa models - # they have to start at padding_idx + 1 and ignore - # existing padding tokens - # References: - # - https://github.com/huggingface/transformers/blob/a3d69a8994d673899608a7c17fbf4f953f50474e/src/transformers/models/roberta/modeling_roberta.py#L133 - # - https://github.com/huggingface/transformers/blob/a3d69a8994d673899608a7c17fbf4f953f50474e/src/transformers/models/roberta/modeling_roberta.py#L1669 - pos_list = [] - token_list = [] - offset = 0 - for seq_len in seq_lens: - pos_list.append(position_ids[offset:offset + seq_len]) - token_list.append(input_ids[offset:offset + seq_len]) - offset += seq_len - - offset = 0 - for positions, tokens in zip(pos_list, token_list): - # Verify assumption that incoming position are - # always a sequence from 0 to N. 
- expected_pos = torch.arange(positions.size()[0], - dtype=torch.long, - device=inputs_embeds.device) - assert torch.equal(positions, expected_pos) - new_pos = create_position_ids_from_input_ids( - tokens, self.padding_idx) - seq_len = new_pos.shape[0] - position_ids[offset:offset + seq_len] = new_pos - offset += seq_len + + # Replace position ids because in RoBERTa models + # they have to start at padding_idx + 1 and ignore + # existing padding tokens + # References: + # - https://github.com/huggingface/transformers/blob/a3d69a8994d673899608a7c17fbf4f953f50474e/src/transformers/models/roberta/modeling_roberta.py#L133 + # - https://github.com/huggingface/transformers/blob/a3d69a8994d673899608a7c17fbf4f953f50474e/src/transformers/models/roberta/modeling_roberta.py#L1669 + pos_list = [] + token_list = [] + offset = 0 + for seq_len in seq_lens: + pos_list.append(position_ids[offset:offset + seq_len]) + token_list.append(input_ids[offset:offset + seq_len]) + offset += seq_len + + new_pos_list = [] + for positions, tokens in zip(pos_list, token_list): + # Verify assumption that incoming position are + # always a sequence from 0 to N. + expected_pos = torch.arange(positions.size()[0], + dtype=torch.long, + device=inputs_embeds.device) + assert torch.equal(positions, expected_pos) + new_pos_list.append( + create_position_ids_from_input_ids(tokens, self.padding_idx)) + position_ids = torch.cat(new_pos_list) # Position embeddings. position_embeddings = self.position_embeddings(position_ids) @@ -166,7 +150,8 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): assert len(loaded), "Unable to load RobertaEmbeddingModel" -class RobertaForSequenceClassification(nn.Module, SupportsCrossEncoding): +class RobertaForSequenceClassification(nn.Module, SupportsCrossEncoding, + SupportsV0Only): """A model that uses Roberta to provide embedding functionalities. This class encapsulates the BertModel and provides an interface for diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py index abad1c79b5ae..91a7c43cd8d8 100755 --- a/vllm/v1/attention/backends/flash_attn.py +++ b/vllm/v1/attention/backends/flash_attn.py @@ -548,13 +548,11 @@ def __init__( f"Supported head sizes are: {support_head_sizes}. " "Set VLLM_USE_V1=0 to use another attention backend.") - if attn_type not in [ - AttentionType.DECODER, AttentionType.ENCODER_ONLY - ]: - raise NotImplementedError("Encoder/decoder cross-attention " - "is not implemented for " + if attn_type != AttentionType.DECODER: + raise NotImplementedError("Encoder self-attention and " + "encoder/decoder cross-attention " + "are not implemented for " "FlashAttentionImpl") - self.attn_type = attn_type self.use_irope = use_irope self.vllm_flash_attn_version = get_flash_attn_version() if is_quantized_kv_cache(self.kv_cache_dtype) \ @@ -667,7 +665,7 @@ def forward( seqused_k=seqused_k, max_seqlen_k=max_seqlen_k, softmax_scale=self.scale, - causal=_get_causal_option(self.attn_type), + causal=True, alibi_slopes=self.alibi_slopes, window_size=self.sliding_window, block_table=block_table, @@ -869,21 +867,3 @@ def cascade_attention( # Merge prefix and suffix outputs, and store the result in output. merge_attn_states(output, prefix_output, prefix_lse, suffix_output, suffix_lse) - - -def _get_causal_option(attn_type: str) -> bool: - """ - Determine whether the given attention type is suitable for causal - attention mechanisms. 
- - Args: - attn_type (AttentionType): The type of attention being evaluated - - Returns: - bool: Returns `True` if the attention type is suitable for causal - attention (i.e., not encoder, encoder-only, or encoder-decoder), - otherwise returns `False`. - """ - return not (attn_type == AttentionType.ENCODER - or attn_type == AttentionType.ENCODER_ONLY - or attn_type == AttentionType.ENCODER_DECODER) diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py index 65d880bcc7e6..6d4bcfe64a35 100644 --- a/vllm/v1/core/kv_cache_utils.py +++ b/vllm/v1/core/kv_cache_utils.py @@ -914,7 +914,6 @@ def is_hybrid(kv_cache_spec: dict[str, KVCacheSpec]) -> bool: dtype=spec.dtype, use_mla=spec.use_mla, sliding_window=spec.sliding_window, - attn_type=str(spec.attn_type), ) if is_hybrid(kv_cache_spec): diff --git a/vllm/v1/core/sched/output.py b/vllm/v1/core/sched/output.py index 1a2e5e692821..eb2940aa57d6 100644 --- a/vllm/v1/core/sched/output.py +++ b/vllm/v1/core/sched/output.py @@ -24,7 +24,6 @@ class NewRequestData: req_id: str prompt_token_ids: list[int] - token_type_ids: Optional[list[int]] mm_inputs: list[MultiModalKwargs] mm_hashes: list[str] mm_positions: list[PlaceholderRange] @@ -43,7 +42,6 @@ def from_request( return cls( req_id=request.request_id, prompt_token_ids=request.prompt_token_ids, - token_type_ids=request.token_type_ids, mm_inputs=request.mm_inputs, mm_hashes=request.mm_hashes, mm_positions=request.mm_positions, diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py index 24d4a28eb7a3..4d1696a9b43a 100644 --- a/vllm/v1/engine/__init__.py +++ b/vllm/v1/engine/__init__.py @@ -49,7 +49,6 @@ class EngineCoreRequest( request_id: str prompt_token_ids: list[int] - token_type_ids: Optional[list[int]] mm_inputs: Optional[Sequence[Optional[MultiModalKwargs]]] mm_hashes: Optional[list[str]] mm_placeholders: Optional[list[PlaceholderRange]] diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index bfd47be3afc7..3303e46a8d70 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -36,7 +36,7 @@ EngineCoreRequestType, UtilityOutput) from vllm.v1.engine.mm_input_cache import MirroredProcessingCache from vllm.v1.executor.abstract import Executor -from vllm.v1.kv_cache_interface import AttentionSpec, KVCacheConfig +from vllm.v1.kv_cache_interface import KVCacheConfig from vllm.v1.metrics.stats import SchedulerStats from vllm.v1.outputs import ModelRunnerOutput from vllm.v1.request import Request, RequestStatus @@ -149,23 +149,6 @@ def _initialize_kv_caches( zip(kv_cache_specs, available_gpu_memory) ] - for kv_cache_spec_one_worker in kv_cache_specs: - for _, spec in kv_cache_spec_one_worker.items(): - if isinstance(spec, AttentionSpec) and \ - spec.attn_type != "decoder": - - logger.info("Found non-decoder layer. Disabling " - "prefix cache and chunked prefill") - self.vllm_config.cache_config.\ - enable_prefix_caching = False - self.vllm_config.scheduler_config.\ - enable_chunked_prefill = False - self.vllm_config.scheduler_config.\ - chunked_prefill_enabled = False - self.vllm_config.scheduler_config.\ - long_prefill_token_threshold = 0 - break - # Since we use a shared centralized controller, we need the # `kv_cache_config` to be consistent across all workers to make sure # all the memory operators can be applied to all workers. 
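# For reference, a standalone restatement of the causal-flag rule that the
# flash_attn.py hunk above deletes: only decoder self-attention runs with a
# causal mask; encoder, encoder-only and encoder/decoder cross-attention do
# not. With the helper gone, the V1 flash-attention backend always passes
# causal=True and therefore only accepts AttentionType.DECODER again. The
# string values below are illustrative stand-ins for the AttentionType
# constants.
def is_causal(attn_type: str) -> bool:
    # Mirrors the deleted _get_causal_option() helper.
    return attn_type not in ("encoder", "encoder_only", "encoder_decoder")


assert is_causal("decoder") is True
assert is_causal("encoder_only") is False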
diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index a813f0a8df71..b00f1444c7b3 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -331,7 +331,6 @@ def process_inputs( return decoder_inputs.get("prompt"), EngineCoreRequest( request_id=request_id, prompt_token_ids=decoder_inputs["prompt_token_ids"], - token_type_ids=decoder_inputs.get("token_type_ids"), mm_inputs=sorted_mm_inputs, mm_hashes=sorted_mm_hashes, mm_placeholders=sorted_mm_positions, diff --git a/vllm/v1/kv_cache_interface.py b/vllm/v1/kv_cache_interface.py index 24815447c4c6..e938f3bfc671 100644 --- a/vllm/v1/kv_cache_interface.py +++ b/vllm/v1/kv_cache_interface.py @@ -74,7 +74,6 @@ class AttentionSpec(KVCacheSpec): head_size: int dtype: torch.dtype use_mla: bool - attn_type: str @property def page_size_bytes(self) -> int: diff --git a/vllm/v1/request.py b/vllm/v1/request.py index c6b666d7e968..c95e4371c684 100644 --- a/vllm/v1/request.py +++ b/vllm/v1/request.py @@ -23,7 +23,6 @@ def __init__( self, request_id: str, prompt_token_ids: list[int], - token_type_ids: Optional[list[int]], multi_modal_inputs: Optional[list[MultiModalKwargs]], multi_modal_hashes: Optional[list[str]], multi_modal_placeholders: Optional[list[PlaceholderRange]], @@ -69,7 +68,6 @@ def __init__( "sampling_params and pooling_params can't both be set") self.prompt_token_ids = prompt_token_ids - self.token_type_ids = token_type_ids self.num_prompt_tokens = len(self.prompt_token_ids) self._output_token_ids: list[int] = [] self._all_token_ids: list[int] = self.prompt_token_ids.copy() @@ -110,7 +108,6 @@ def from_engine_core_request(cls, request: EngineCoreRequest) -> "Request": request_id=request.request_id, client_index=request.client_index, prompt_token_ids=request.prompt_token_ids, - token_type_ids=request.token_type_ids, multi_modal_inputs=request.mm_inputs, multi_modal_hashes=request.mm_hashes, multi_modal_placeholders=request.mm_placeholders, diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py index b98c4563795b..2d76a085a2ad 100644 --- a/vllm/v1/worker/gpu_input_batch.py +++ b/vllm/v1/worker/gpu_input_batch.py @@ -27,7 +27,6 @@ class CachedRequestState: req_id: str prompt_token_ids: list[int] - token_type_ids: Optional[list[int]] mm_inputs: list[MultiModalKwargs] mm_positions: list[PlaceholderRange] sampling_params: Optional[SamplingParams] @@ -90,8 +89,6 @@ def __init__( pin_memory=False, ) self.token_ids_cpu = self.token_ids_cpu_tensor.numpy() - self.token_type_ids_cpu_tensor = None - self._token_type_ids_cpu = None self.num_tokens = np.zeros(max_num_reqs, dtype=np.int32) self.num_tokens_no_spec = np.zeros(max_num_reqs, dtype=np.int32) self.num_prompt_tokens = np.zeros(max_num_reqs, dtype=np.int32) @@ -234,22 +231,6 @@ def __init__( self.pooling_params: dict[str, PoolingParams] = {} - @property - def token_type_ids_cpu(self) -> np.ndarray: - if self._token_type_ids_cpu is None: - self.token_type_ids_cpu_tensor = torch.zeros( - self.token_ids_cpu_tensor.shape, - device="cpu", - dtype=torch.int8, - pin_memory=False, - ) - self._token_type_ids_cpu = cast( - torch.Tensor, self.token_type_ids_cpu_tensor).numpy() - return self._token_type_ids_cpu - - def has_token_types(self) -> bool: - return self._token_type_ids_cpu is not None - @property def req_ids(self) -> list[str]: # None elements should only be present transiently @@ -280,9 +261,6 @@ def add_request( self.num_prompt_tokens[req_index] = num_prompt_tokens self.token_ids_cpu[ req_index, :num_prompt_tokens] = 
request.prompt_token_ids - if request.token_type_ids is not None: - self.token_type_ids_cpu[ - req_index, :num_prompt_tokens] = request.token_type_ids start_idx = num_prompt_tokens end_idx = start_idx + len(request.output_token_ids) self.token_ids_cpu[req_index, @@ -469,10 +447,6 @@ def swap_states(self, i1: int, i2: int) -> None: tmp = self.token_ids_cpu[i1, ...].copy() self.token_ids_cpu[i1, ...] = self.token_ids_cpu[i2, ...] self.token_ids_cpu[i2, ...] = tmp - if self.has_token_types(): - tmp2 = self.token_type_ids_cpu[i1, ...].copy() - self.token_type_ids_cpu[i1, ...] = self.token_type_ids_cpu[i2, ...] - self.token_type_ids_cpu[i2, ...] = tmp2 swap_dict_values(self.generators, i1, i2) swap_dict_values(self.min_tokens, i1, i2) @@ -524,9 +498,6 @@ def condense(self, empty_req_indices: list[int]) -> None: num_tokens = self.num_tokens[last_req_index] self.token_ids_cpu[empty_index, :num_tokens] = self.token_ids_cpu[ last_req_index, :num_tokens] - if self.has_token_types(): - self.token_type_ids_cpu[empty_index, :num_tokens] = \ - self.token_type_ids_cpu[last_req_index, :num_tokens] self.num_tokens[empty_index] = num_tokens self.num_tokens_no_spec[empty_index] = self.num_tokens_no_spec[ last_req_index] diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 2836b5f7fe8d..1091fbc92243 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -5,7 +5,7 @@ import gc import time import weakref -from typing import TYPE_CHECKING, Any, Optional, Union, cast +from typing import TYPE_CHECKING, Any, Optional, Union import numpy as np import torch @@ -223,7 +223,7 @@ def __init__( self.slot_mapping = torch.zeros(self.max_num_tokens, dtype=torch.int64, device=self.device) - self.token_type_ids = None + # None in the first PP rank. The rest are set after load_model. self.intermediate_tensors: Optional[IntermediateTensors] = None @@ -291,13 +291,6 @@ def __init__( # from the KV cache of `shared_kv_cache_layers[layer_name]`. self.shared_kv_cache_layers: dict[str, str] = {} - def get_token_type_ids(self) -> Optional[torch.Tensor]: - if self.token_type_ids is None: - self.token_type_ids = torch.zeros(self.max_num_tokens, - dtype=torch.int8, - device=self.device) - return self.token_type_ids - def _may_reorder_batch(self, scheduler_output: "SchedulerOutput") -> bool: """ Update the order of requests in the batch based on the attention @@ -401,7 +394,6 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None: self.requests[req_id] = CachedRequestState( req_id=req_id, prompt_token_ids=new_req_data.prompt_token_ids, - token_type_ids=new_req_data.token_type_ids, mm_inputs=new_req_data.mm_inputs, mm_positions=new_req_data.mm_positions, sampling_params=sampling_params, @@ -616,13 +608,6 @@ def _prepare_inputs( 0, torch.from_numpy(token_indices), out=self.input_ids_cpu[:total_num_scheduled_tokens]) - if self.input_batch.token_type_ids_cpu_tensor is not None: - token_type_ids = torch.index_select( - self.input_batch.token_type_ids_cpu_tensor.flatten(), 0, - torch.from_numpy(token_indices)) - # Copy the tensors to the GPU. - self.get_token_type_ids()[:total_num_scheduled_tokens]\ - .copy_(token_type_ids, non_blocking=True) # Calculate the slot mapping for each KV cache group. 
for kv_cache_group_id, kv_cache_group_spec in enumerate( @@ -1235,17 +1220,11 @@ def execute_model( else: mm_embeds = [] - has_token_types = self.token_type_ids is not None - model_kwargs = {} - if self.is_multimodal_model and get_pp_group().is_first_rank: # NOTE(woosuk): To unify token ids and soft tokens (vision # embeddings), we always use embeddings (rather than token ids) # as input to the multimodal model, even when the input is text. input_ids = self.input_ids[:num_scheduled_tokens] - if has_token_types: - model_kwargs["token_type_ids"] = cast( - torch.Tensor, self.token_type_ids)[:num_scheduled_tokens] if mm_embeds: inputs_embeds = self.model.get_input_embeddings( input_ids, mm_embeds) @@ -1261,9 +1240,6 @@ def execute_model( # multimodal models, it is not desirable for performance since # then the embedding layer is not included in the CUDA graph. input_ids = self.input_ids[:num_input_tokens] - if has_token_types: - model_kwargs["token_type_ids"] = cast( - torch.Tensor, self.token_type_ids)[:num_input_tokens] inputs_embeds = None if self.uses_mrope: positions = self.mrope_positions[:, :num_input_tokens] @@ -1289,7 +1265,6 @@ def execute_model( positions=positions, intermediate_tensors=intermediate_tensors, inputs_embeds=inputs_embeds, - **model_kwargs, ) self.maybe_wait_for_kv_save() @@ -2134,7 +2109,7 @@ def capture_model(self) -> None: # Capture the large shapes first so that the smaller shapes # can reuse the memory pool allocated for the large shapes. with graph_capture(device=self.device): - skip_attn = False + skip_attn = not self.vllm_config.compilation_config.full_cuda_graph for num_tokens in reversed(self.cudagraph_batch_sizes): for _ in range(self.vllm_config.compilation_config. cudagraph_num_of_warmups): @@ -2392,9 +2367,7 @@ def get_kv_cache_spec(self) -> dict[str, KVCacheSpec]: continue # TODO: Support other attention modules, e.g., cross-attention - # encoder only can also benefit from KV cache for prefix caching - if attn_module.attn_type in (AttentionType.DECODER, - AttentionType.ENCODER_ONLY): + if attn_module.attn_type == AttentionType.DECODER: if attn_module.sliding_window is not None: kv_cache_spec[layer_name] = SlidingWindowSpec( block_size=block_size, @@ -2402,18 +2375,17 @@ def get_kv_cache_spec(self) -> dict[str, KVCacheSpec]: head_size=attn_module.head_size, dtype=self.kv_cache_dtype, sliding_window=attn_module.sliding_window, - use_mla=use_mla, - attn_type=str(attn_module.attn_type)) + use_mla=use_mla) else: kv_cache_spec[layer_name] = FullAttentionSpec( block_size=block_size, num_kv_heads=attn_module.num_kv_heads, head_size=attn_module.head_size, dtype=self.kv_cache_dtype, - use_mla=use_mla, - attn_type=str(attn_module.attn_type)) - elif attn_module.attn_type == AttentionType.ENCODER: - # encoder attention does not need KV cache. + use_mla=use_mla) + elif attn_module.attn_type in (AttentionType.ENCODER, + AttentionType.ENCODER_ONLY): + # encoder-only attention does not need KV cache. 
continue elif attn_module.attn_type == AttentionType.ENCODER_DECODER: raise NotImplementedError diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index d2257f9aa016..14f5a16dd320 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -394,7 +394,6 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> bool: self.requests[req_id] = CachedRequestState( req_id=req_id, prompt_token_ids=new_req_data.prompt_token_ids, - token_type_ids=new_req_data.token_type_ids, mm_inputs=new_req_data.mm_inputs, mm_positions=new_req_data.mm_positions, sampling_params=sampling_params, @@ -497,7 +496,6 @@ def get_kv_cache_spec(self) -> dict[str, KVCacheSpec]: dtype=self.kv_cache_dtype, sliding_window=attn_module.sliding_window, use_mla=False, - attn_type=str(attn_module.attn_type), ) else: kv_cache_spec[layer_name] = FullAttentionSpec( @@ -506,7 +504,6 @@ def get_kv_cache_spec(self) -> dict[str, KVCacheSpec]: head_size=attn_module.head_size, dtype=self.kv_cache_dtype, use_mla=False, - attn_type=str(attn_module.attn_type), ) elif attn_module.attn_type in (AttentionType.ENCODER, AttentionType.ENCODER_ONLY): From 660dd9c0c01f77661917ecb6bbd97d49e5773eb8 Mon Sep 17 00:00:00 2001 From: Max de Bayser Date: Mon, 9 Jun 2025 13:26:53 -0300 Subject: [PATCH 31/63] fix several tests Signed-off-by: Max de Bayser --- tests/entrypoints/openai/test_pooling.py | 15 +++++++++---- .../models/language/generation/test_common.py | 2 ++ tests/models/registry.py | 22 +++++++++---------- tests/tokenization/test_detokenize.py | 1 + tests/v1/core/test_kv_cache_utils.py | 1 + tests/v1/core/test_prefix_caching.py | 1 + tests/v1/core/test_scheduler.py | 1 + vllm/entrypoints/openai/serving_pooling.py | 4 +++- vllm/model_executor/models/modernbert.py | 5 +++-- vllm/v1/worker/gpu_model_runner.py | 13 +++-------- 10 files changed, 37 insertions(+), 28 deletions(-) diff --git a/tests/entrypoints/openai/test_pooling.py b/tests/entrypoints/openai/test_pooling.py index cf16ace6537a..41c30e71684b 100644 --- a/tests/entrypoints/openai/test_pooling.py +++ b/tests/entrypoints/openai/test_pooling.py @@ -7,6 +7,7 @@ import pytest import requests +from tests.models.utils import check_embeddings_close from vllm.entrypoints.openai.protocol import PoolingResponse from vllm.transformers_utils.tokenizer import get_tokenizer @@ -223,8 +224,11 @@ async def test_batch_base64_pooling(server: RemoteOpenAIServer, np.frombuffer(base64.b64decode(data.data), dtype="float32").tolist()) - assert responses_float.data[0].data == decoded_responses_base64_data[0] - assert responses_float.data[1].data == decoded_responses_base64_data[1] + check_embeddings_close( + embeddings_0_lst=[d.data for d in responses_float.data], + embeddings_1_lst=decoded_responses_base64_data, + name_0="float32", + name_1="base64") # Default response is float32 decoded from base64 by OpenAI Client default_response = requests.post( @@ -237,5 +241,8 @@ async def test_batch_base64_pooling(server: RemoteOpenAIServer, default_response.raise_for_status() responses_default = PoolingResponse.model_validate(default_response.json()) - assert responses_float.data[0].data == responses_default.data[0].data - assert responses_float.data[1].data == responses_default.data[1].data + check_embeddings_close( + embeddings_0_lst=[d.data for d in responses_default.data], + embeddings_1_lst=[d.data for d in responses_default.data], + name_0="float32", + name_1="base64") diff --git a/tests/models/language/generation/test_common.py 
b/tests/models/language/generation/test_common.py index f656f90c4bd3..bfcfef04db47 100644 --- a/tests/models/language/generation/test_common.py +++ b/tests/models/language/generation/test_common.py @@ -146,6 +146,8 @@ def test_models(hf_runner, vllm_runner, example_prompts, model: str, vllm_outputs_from_embeds = vllm_model.generate_greedy_logprobs( prompt_embeds, max_tokens, num_logprobs) + print(f"{hf_outputs=}") + print(f"{vllm_outputs=}") check_logprobs_close( outputs_0_lst=hf_outputs, outputs_1_lst=vllm_outputs, diff --git a/tests/models/registry.py b/tests/models/registry.py index e6543c197348..d98d04bdce74 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -262,8 +262,8 @@ def check_available_online( _EMBEDDING_EXAMPLE_MODELS = { # [Text-only] - "BertModel": _HfExamplesInfo("BAAI/bge-base-en-v1.5"), - "Gemma2Model": _HfExamplesInfo("BAAI/bge-multilingual-gemma2"), + "BertModel": _HfExamplesInfo("BAAI/bge-base-en-v1.5", v0_only=True), + "Gemma2Model": _HfExamplesInfo("BAAI/bge-multilingual-gemma2", v0_only=True), # noqa: E501 "GritLM": _HfExamplesInfo("parasail-ai/GritLM-7B-vllm"), "GteModel": _HfExamplesInfo("Snowflake/snowflake-arctic-embed-m-v2.0", trust_remote_code=True), @@ -276,16 +276,16 @@ def check_available_online( "LlamaModel": _HfExamplesInfo("llama", is_available_online=False), "MistralModel": _HfExamplesInfo("intfloat/e5-mistral-7b-instruct"), "ModernBertModel": _HfExamplesInfo("Alibaba-NLP/gte-modernbert-base", - trust_remote_code=True), + trust_remote_code=True, v0_only=True), "NomicBertModel": _HfExamplesInfo("nomic-ai/nomic-embed-text-v2-moe", - trust_remote_code=True), + trust_remote_code=True, v0_only=True), # noqa: E501 "Qwen2Model": _HfExamplesInfo("ssmits/Qwen2-7B-Instruct-embed-base"), "Qwen2ForRewardModel": _HfExamplesInfo("Qwen/Qwen2.5-Math-RM-72B"), "Qwen2ForProcessRewardModel": _HfExamplesInfo("Qwen/Qwen2.5-Math-PRM-7B"), "Qwen2ForSequenceClassification": _HfExamplesInfo("jason9693/Qwen2.5-1.5B-apeach"), # noqa: E501 - "RobertaModel": _HfExamplesInfo("sentence-transformers/stsb-roberta-base-v2"), # noqa: E501 - "RobertaForMaskedLM": _HfExamplesInfo("sentence-transformers/all-roberta-large-v1"), # noqa: E501 - "XLMRobertaModel": _HfExamplesInfo("intfloat/multilingual-e5-small"), + "RobertaModel": _HfExamplesInfo("sentence-transformers/stsb-roberta-base-v2", v0_only=True), # noqa: E501 + "RobertaForMaskedLM": _HfExamplesInfo("sentence-transformers/all-roberta-large-v1", v0_only=True), # noqa: E501 + "XLMRobertaModel": _HfExamplesInfo("intfloat/multilingual-e5-small", v0_only=True), # noqa: E501 # [Multimodal] "LlavaNextForConditionalGeneration": _HfExamplesInfo("royokong/e5-v"), "Phi3VForCausalLM": _HfExamplesInfo("TIGER-Lab/VLM2Vec-Full", @@ -297,10 +297,10 @@ def check_available_online( _CROSS_ENCODER_EXAMPLE_MODELS = { # [Text-only] - "BertForSequenceClassification": _HfExamplesInfo("cross-encoder/ms-marco-MiniLM-L-6-v2"), # noqa: E501 - "RobertaForSequenceClassification": _HfExamplesInfo("cross-encoder/quora-roberta-base"), # noqa: E501 - "XLMRobertaForSequenceClassification": _HfExamplesInfo("BAAI/bge-reranker-v2-m3"), # noqa: E501 - "ModernBertForSequenceClassification": _HfExamplesInfo("Alibaba-NLP/gte-reranker-modernbert-base"), # noqa: E501 + "BertForSequenceClassification": _HfExamplesInfo("cross-encoder/ms-marco-MiniLM-L-6-v2", v0_only=True), # noqa: E501 + "RobertaForSequenceClassification": _HfExamplesInfo("cross-encoder/quora-roberta-base", v0_only=True), # noqa: E501 + "XLMRobertaForSequenceClassification": 
_HfExamplesInfo("BAAI/bge-reranker-v2-m3", v0_only=True), # noqa: E501 + "ModernBertForSequenceClassification": _HfExamplesInfo("Alibaba-NLP/gte-reranker-modernbert-base", v0_only=True), # noqa: E501 } _MULTIMODAL_EXAMPLE_MODELS = { diff --git a/tests/tokenization/test_detokenize.py b/tests/tokenization/test_detokenize.py index 9f2414eca24f..f8aeba8301b1 100644 --- a/tests/tokenization/test_detokenize.py +++ b/tests/tokenization/test_detokenize.py @@ -68,6 +68,7 @@ def _run_incremental_decode(tokenizer, None, params, None, + None, 0.0, None, cache_salt=None, diff --git a/tests/v1/core/test_kv_cache_utils.py b/tests/v1/core/test_kv_cache_utils.py index ab7aa02823ab..c0a919caf3fd 100644 --- a/tests/v1/core/test_kv_cache_utils.py +++ b/tests/v1/core/test_kv_cache_utils.py @@ -43,6 +43,7 @@ def make_request(request_id, multi_modal_hashes=mm_hashes, multi_modal_placeholders=mm_positions, sampling_params=SamplingParams(max_tokens=17), + pooling_params=None, eos_token_id=100, lora_request=None, cache_salt=cache_salt, diff --git a/tests/v1/core/test_prefix_caching.py b/tests/v1/core/test_prefix_caching.py index bf4cb539ebef..120b8ef08af7 100644 --- a/tests/v1/core/test_prefix_caching.py +++ b/tests/v1/core/test_prefix_caching.py @@ -39,6 +39,7 @@ def make_request(request_id, multi_modal_placeholders=mm_positions, sampling_params=SamplingParams(max_tokens=17, prompt_logprobs=prompt_logprobs), + pooling_params=None, eos_token_id=100, lora_request=None, cache_salt=cache_salt, diff --git a/tests/v1/core/test_scheduler.py b/tests/v1/core/test_scheduler.py index d348956aa177..3a91d3ca9e79 100644 --- a/tests/v1/core/test_scheduler.py +++ b/tests/v1/core/test_scheduler.py @@ -135,6 +135,7 @@ def create_requests(num_requests: int, request_id=f"{i}", prompt_token_ids=[i] * num_tokens, sampling_params=sampling_params, + pooling_params=None, multi_modal_inputs=mm_inputs, multi_modal_placeholders=mm_position, multi_modal_hashes=None, diff --git a/vllm/entrypoints/openai/serving_pooling.py b/vllm/entrypoints/openai/serving_pooling.py index b896cc46b9d0..c2ed50d04d12 100644 --- a/vllm/entrypoints/openai/serving_pooling.py +++ b/vllm/entrypoints/openai/serving_pooling.py @@ -9,6 +9,7 @@ import jinja2 import numpy as np +import torch from fastapi import Request from typing_extensions import assert_never @@ -39,7 +40,8 @@ def _get_data( elif encoding_format == "base64": # Force to use float32 for base64 encoding # to match the OpenAI python client behavior - pooling_bytes = np.array(output.data, dtype="float32").tobytes() + pt_float32 = output.data.to(dtype=torch.float32) + pooling_bytes = np.array(pt_float32, dtype="float32").tobytes() return base64.b64encode(pooling_bytes).decode("utf-8") assert_never(encoding_format) diff --git a/vllm/model_executor/models/modernbert.py b/vllm/model_executor/models/modernbert.py index 35f416a6e21e..7c1f889e8f38 100644 --- a/vllm/model_executor/models/modernbert.py +++ b/vllm/model_executor/models/modernbert.py @@ -21,7 +21,7 @@ from vllm.model_executor.pooling_metadata import PoolingMetadata from vllm.sequence import IntermediateTensors, PoolerOutput -from .interfaces import SupportsCrossEncoding +from .interfaces import SupportsCrossEncoding, SupportsV0Only from .utils import WeightsMapper, maybe_prefix @@ -270,7 +270,8 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: return pooled_output -class ModernBertForSequenceClassification(nn.Module, SupportsCrossEncoding): +class ModernBertForSequenceClassification(nn.Module, SupportsV0Only, + SupportsCrossEncoding): 
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 1091fbc92243..df9b3f5319cf 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1801,9 +1801,9 @@ def _dummy_run( num_scheduled_tokens = np.array(num_scheduled_tokens_list, dtype=np.int32) - self.seq_lens.fill_(0) - seq_lens = self.seq_lens[:num_reqs] - seq_lens.copy_(torch.from_numpy(num_scheduled_tokens)) + #self.seq_lens.fill_(0) + #seq_lens = self.seq_lens[:num_reqs] + #seq_lens.copy_(torch.from_numpy(num_scheduled_tokens)) if skip_attn: attn_metadata: Optional[dict[str, Any]] = None @@ -1847,12 +1847,6 @@ def _dummy_run( else: positions = self.positions[:num_tokens] - offset = 0 - for seq_len in num_scheduled_tokens_list: - positions[offset:offset + seq_len] = torch.arange( - seq_len, dtype=positions.dtype) - offset += seq_len - if get_pp_group().is_first_rank: intermediate_tensors = None else: @@ -1878,7 +1872,6 @@ def _dummy_run( inputs_embeds=inputs_embeds, ) - positions = self.positions[:num_tokens].zero_() if self.use_aux_hidden_state_outputs: hidden_states, _ = outputs else: From cdd70c9e260a1c0660327fc777e6d33ce56bb0e5 Mon Sep 17 00:00:00 2001 From: Max de Bayser Date: Mon, 9 Jun 2025 14:25:25 -0300 Subject: [PATCH 32/63] Fix test Signed-off-by: Max de Bayser --- tests/v1/core/test_scheduler.py | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/tests/v1/core/test_scheduler.py b/tests/v1/core/test_scheduler.py index 3a91d3ca9e79..b0b1116eb536 100644 --- a/tests/v1/core/test_scheduler.py +++ b/tests/v1/core/test_scheduler.py @@ -284,6 +284,7 @@ def test_schedule_partial_requests(): spec_token_ids=None, logprobs=None, prompt_logprobs_dict={}, + pooler_output=[], ) scheduler.update_from_output(output, model_runner_output) @@ -334,6 +335,7 @@ def test_no_mm_input_chunking(): spec_token_ids=None, logprobs=None, prompt_logprobs_dict={}, + pooler_output=[], ) scheduler.update_from_output(output, model_runner_output) @@ -397,6 +399,7 @@ def test_schedule_concurrent_partial_requests(enable_prefix_caching: bool): spec_token_ids=None, logprobs=None, prompt_logprobs_dict={}, + pooler_output=[], ) scheduler.update_from_output(output, model_runner_output) @@ -421,6 +424,7 @@ def test_schedule_concurrent_partial_requests(enable_prefix_caching: bool): spec_token_ids=None, logprobs=None, prompt_logprobs_dict={}, + pooler_output=[], ) scheduler.update_from_output(output1, model_runner_output) output2 = scheduler.schedule() @@ -474,7 +478,8 @@ def test_stop_via_update_from_output(): 11]], # First request hits EOS, second continues spec_token_ids=None, logprobs=None, - prompt_logprobs_dict={}) + prompt_logprobs_dict={}, + pooler_output=[]) scheduler.update_from_output(scheduler_output, model_output) @@ -524,7 +529,8 @@ def test_stop_via_update_from_output(): [13, 14]], # First request hits stop token spec_token_ids=None, logprobs=None, - prompt_logprobs_dict={}) + prompt_logprobs_dict={}, + pooler_output=[]) scheduler.update_from_output(scheduler_output, model_output) @@ -573,7 +579,8 @@ def test_stop_via_update_from_output(): [13]], # First request exceeds max_tokens spec_token_ids=None, logprobs=None, - prompt_logprobs_dict={}) + prompt_logprobs_dict={}, + pooler_output=[]) scheduler.update_from_output(scheduler_output, model_output) @@ -615,7 +622,8 @@ def test_stop_via_update_from_output(): sampled_token_ids=[[EOS_TOKEN_ID, 10, 11]], 
spec_token_ids=None, logprobs=None, - prompt_logprobs_dict={}) + prompt_logprobs_dict={}, + pooler_output=[]) scheduler.update_from_output(scheduler_output, model_output) @@ -664,6 +672,7 @@ def test_schedule_concurrent_batches(enable_prefix_caching: Optional[bool], spec_token_ids=None, logprobs=None, prompt_logprobs_dict={}, + pooler_output=[], ) scheduler.update_from_output(scheduler_output0, model_runner_output) @@ -681,6 +690,7 @@ def test_schedule_concurrent_batches(enable_prefix_caching: Optional[bool], spec_token_ids=None, logprobs=None, prompt_logprobs_dict={}, + pooler_output=[], ) scheduler.update_from_output(scheduler_output1, model_runner_output) @@ -731,6 +741,7 @@ def test_schedule_spec_decoding_stats(spec_tokens, output_tokens, expected): spec_token_ids=spec_tokens, logprobs=None, prompt_logprobs_dict={}, + pooler_output=[], ) engine_core_outputs = scheduler.update_from_output(output, model_runner_output) @@ -770,6 +781,7 @@ def test_schedule_spec_decoding_stats(spec_tokens, output_tokens, expected): spec_token_ids=None, logprobs=None, prompt_logprobs_dict={}, + pooler_output=[], ) engine_core_outputs = scheduler.update_from_output(output, model_runner_output) @@ -897,6 +909,7 @@ def test_kv_connector_basic(): spec_token_ids=None, logprobs=None, prompt_logprobs_dict={}, + pooler_output=[], ) # Ensure ScheduleOutput is correct. @@ -942,6 +955,7 @@ def test_kv_connector_basic(): spec_token_ids=None, logprobs=None, prompt_logprobs_dict={}, + pooler_output=[], ) # We should get a local cache hit of NUM_TOKENS_PREFIX and @@ -1008,6 +1022,7 @@ def test_kv_connector_unable_to_allocate(): spec_token_ids=None, logprobs=None, prompt_logprobs_dict={}, + pooler_output=[], ) # Just one request should be running. @@ -1088,6 +1103,7 @@ def test_kv_connector_handles_preemption(): spec_token_ids=None, logprobs=None, prompt_logprobs_dict={}, + pooler_output=[], ) # All can be scheduled - 1st token. 
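For reference, the pattern these scheduler-test hunks repeat can be condensed into one helper. This is an illustrative sketch only: the field names mirror the ModelRunnerOutput constructor calls elsewhere in this series, while the import path, helper name, and placeholder token values are assumptions rather than code from the patch.

# Sketch of the fake model output the scheduler tests feed back, including the
# new pooler_output field. Import path and helper name are assumed.
from vllm.v1.outputs import ModelRunnerOutput

def make_dummy_runner_output(req_ids: list[str]) -> ModelRunnerOutput:
    return ModelRunnerOutput(
        req_ids=req_ids,
        req_id_to_index={rid: i for i, rid in enumerate(req_ids)},
        sampled_token_ids=[[0] for _ in req_ids],  # one placeholder token per request
        spec_token_ids=None,
        logprobs=None,
        prompt_logprobs_dict={},
        pooler_output=[],  # field added by this series; empty for sampling-only runs
    )
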
@@ -1182,6 +1198,7 @@ def make_output(scheduler: Scheduler): spec_token_ids=None, logprobs=None, prompt_logprobs_dict={}, + pooler_output=[], ) From 08321158769f8c03ed73b6f30a12d38a16d35364 Mon Sep 17 00:00:00 2001 From: Max de Bayser Date: Mon, 9 Jun 2025 16:04:57 -0300 Subject: [PATCH 33/63] disable bert test Signed-off-by: Max de Bayser --- tests/compile/test_basic_correctness.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/tests/compile/test_basic_correctness.py b/tests/compile/test_basic_correctness.py index dc6cfe9daccd..a6de4bcac05d 100644 --- a/tests/compile/test_basic_correctness.py +++ b/tests/compile/test_basic_correctness.py @@ -68,16 +68,17 @@ class TestSetting: method="encode", fullgraph=True, ), - # encoder-based embedding model (BERT) - TestSetting( - model="BAAI/bge-base-en-v1.5", - model_args=["--task", "embed"], - pp_size=1, - tp_size=1, - attn_backend="XFORMERS", - method="encode", - fullgraph=True, - ), + # TODO: bert models are not supported in V1 yet + # # encoder-based embedding model (BERT) + # TestSetting( + # model="BAAI/bge-base-en-v1.5", + # model_args=["--task", "embed"], + # pp_size=1, + # tp_size=1, + # attn_backend="XFORMERS", + # method="encode", + # fullgraph=True, + # ), # vision language model TestSetting( model="microsoft/Phi-3.5-vision-instruct", From 10bbf74f38da3bb7cad2bd098892786e63a26dcb Mon Sep 17 00:00:00 2001 From: Max de Bayser Date: Mon, 9 Jun 2025 18:20:35 -0300 Subject: [PATCH 34/63] fix tests Signed-off-by: Max de Bayser --- tests/v1/engine/test_engine_core_client.py | 1 + tests/v1/engine/test_output_processor.py | 13 +++++++++---- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/tests/v1/engine/test_engine_core_client.py b/tests/v1/engine/test_engine_core_client.py index c2dc3b4731b5..c10de675a5a0 100644 --- a/tests/v1/engine/test_engine_core_client.py +++ b/tests/v1/engine/test_engine_core_client.py @@ -52,6 +52,7 @@ def make_request( mm_hashes=None, mm_placeholders=None, sampling_params=params, + pooling_params=None, eos_token_id=None, arrival_time=time.time(), lora_request=None, diff --git a/tests/v1/engine/test_output_processor.py b/tests/v1/engine/test_output_processor.py index 6b88b0cf17e3..1c8c5f25e29b 100644 --- a/tests/v1/engine/test_output_processor.py +++ b/tests/v1/engine/test_output_processor.py @@ -66,7 +66,8 @@ def test_incremental_detokenization(request_output_kind: RequestOutputKind, output_kind=request_output_kind, stop=[], include_stop_str_in_output=False, - )) + ), + pooling_params=None) for idx, prompt_tokens in enumerate(dummy_test_vectors.prompt_tokens) ] @@ -416,7 +417,8 @@ def test_logprobs_processor(request_output_kind: RequestOutputKind, include_stop_str_in_output=False, logprobs=num_sample_logprobs, prompt_logprobs=num_prompt_logprobs, - )) + ), + pooling_params=None) for idx, prompt_tokens in enumerate(dummy_test_vectors.prompt_tokens) ] @@ -582,7 +584,8 @@ def test_stop_token(include_stop_str_in_output: bool, logprobs=num_sample_logprobs, prompt_logprobs=None, ignore_eos=ignore_eos, - )) + ), + pooling_params=None) # Add request to the detokenizer. 
output_processor.add_request(request, prompt_string) @@ -678,7 +681,8 @@ def test_stop_string(include_stop_str_in_output: bool, include_stop_str_in_output=include_stop_str_in_output, logprobs=num_sample_logprobs, prompt_logprobs=None, - )) + ), + pooling_params=None) for idx, prompt_tokens in enumerate(dummy_test_vectors.prompt_tokens) ] @@ -786,6 +790,7 @@ def test_iteration_stats(dummy_test_vectors): cache_salt=None, data_parallel_rank=None, sampling_params=SamplingParams(), + pooling_params=None, ) for idx, prompt_tokens in enumerate(dummy_test_vectors.prompt_tokens) ] From ee892aac5c325fb1c43270e1054ccba1841e82d8 Mon Sep 17 00:00:00 2001 From: Max de Bayser Date: Mon, 9 Jun 2025 19:29:04 -0300 Subject: [PATCH 35/63] limit context length to fit test GPU Signed-off-by: Max de Bayser --- tests/compile/test_basic_correctness.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/tests/compile/test_basic_correctness.py b/tests/compile/test_basic_correctness.py index a6de4bcac05d..1ee9b234d9f4 100644 --- a/tests/compile/test_basic_correctness.py +++ b/tests/compile/test_basic_correctness.py @@ -31,7 +31,7 @@ class TestSetting: # basic llama model TestSetting( model="meta-llama/Llama-3.2-1B-Instruct", - model_args=[], + model_args=["--max-model-len", "2048"], pp_size=2, tp_size=2, attn_backend="FLASHINFER", @@ -41,7 +41,7 @@ class TestSetting: # llama model with quantization TestSetting( model="TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ", - model_args=["--quantization", "gptq"], + model_args=["--quantization", "gptq", "--max-model-len", "2048"], pp_size=1, tp_size=1, attn_backend="FLASH_ATTN", @@ -51,7 +51,7 @@ class TestSetting: # MoE model TestSetting( model="ibm/PowerMoE-3b", - model_args=[], + model_args=["--max-model-len", "2048"], pp_size=1, tp_size=2, attn_backend="FLASH_ATTN", @@ -61,7 +61,10 @@ class TestSetting: # embedding model TestSetting( model="BAAI/bge-multilingual-gemma2", - model_args=["--task", "embed", "--dtype", "bfloat16"], + model_args=[ + "--task", "embed", "--dtype", "bfloat16", "--max-model-len", + "2048" + ], pp_size=1, tp_size=1, attn_backend="FLASH_ATTN", From 2e12eba9c4fe3cb8b0bdcdc4260f6052665d7092 Mon Sep 17 00:00:00 2001 From: Max de Bayser Date: Mon, 9 Jun 2025 20:52:07 -0300 Subject: [PATCH 36/63] limit context length to fit test GPU Signed-off-by: Max de Bayser --- examples/offline_inference/vision_language_embedding.py | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/offline_inference/vision_language_embedding.py b/examples/offline_inference/vision_language_embedding.py index 1f5bd4ad72b0..9451825f0b73 100644 --- a/examples/offline_inference/vision_language_embedding.py +++ b/examples/offline_inference/vision_language_embedding.py @@ -94,6 +94,7 @@ def run_vlm2vec(query: Query) -> ModelRequestData: engine_args = EngineArgs( model="TIGER-Lab/VLM2Vec-Full", task="embed", + max_model_len=4096, trust_remote_code=True, mm_processor_kwargs={"num_crops": 4}, limit_mm_per_prompt={"image": 1}, From 14fcf2481b62b776e4f410733f7059e5ba488341 Mon Sep 17 00:00:00 2001 From: Max de Bayser Date: Mon, 9 Jun 2025 21:02:27 -0300 Subject: [PATCH 37/63] fix test Signed-off-by: Max de Bayser --- tests/v1/engine/test_engine_core.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/v1/engine/test_engine_core.py b/tests/v1/engine/test_engine_core.py index 1cbbf30371af..ac10c7a0f976 100644 --- a/tests/v1/engine/test_engine_core.py +++ b/tests/v1/engine/test_engine_core.py @@ -38,6 +38,7 @@ def make_request() -> EngineCoreRequest: 
mm_hashes=None, mm_placeholders=None, sampling_params=SamplingParams(), + pooling_params=None, eos_token_id=None, arrival_time=time.time(), lora_request=None, From 0624435c607f733c2a234dcf14ee0fd64865fafc Mon Sep 17 00:00:00 2001 From: Max de Bayser Date: Mon, 9 Jun 2025 22:50:11 -0300 Subject: [PATCH 38/63] fix test Signed-off-by: Max de Bayser --- tests/v1/worker/test_gpu_model_runner.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/v1/worker/test_gpu_model_runner.py b/tests/v1/worker/test_gpu_model_runner.py index 3d51b53df2ce..0072af48ec51 100644 --- a/tests/v1/worker/test_gpu_model_runner.py +++ b/tests/v1/worker/test_gpu_model_runner.py @@ -123,6 +123,7 @@ def _schedule_new_request(*req_ids: str) -> SchedulerOutput: mm_hashes=[], mm_positions=[], sampling_params=SamplingParams(), + pooling_params=None, block_ids=[[0]], num_computed_tokens=0, lora_request=None, From 051f6d44fd7ae1a750509d64d9c97f893e1a9464 Mon Sep 17 00:00:00 2001 From: 22quinn <33176974+22quinn@users.noreply.github.com> Date: Tue, 10 Jun 2025 03:29:03 -0700 Subject: [PATCH 39/63] Fix _construct_cached_request_state Signed-off-by: 22quinn <33176974+22quinn@users.noreply.github.com> --- tests/v1/worker/test_gpu_input_batch.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/v1/worker/test_gpu_input_batch.py b/tests/v1/worker/test_gpu_input_batch.py index de6ebe4f6716..20d85bb85377 100644 --- a/tests/v1/worker/test_gpu_input_batch.py +++ b/tests/v1/worker/test_gpu_input_batch.py @@ -201,6 +201,7 @@ def _construct_cached_request_state(req_id_suffix: int): req_id=f"req_id_{req_id_suffix}", prompt_token_ids=prompt_token_ids, sampling_params=_create_sampling_params(), + pooling_params=None, mm_inputs=[], mm_positions=[], block_ids=([], ), From 214cf0661237aa17ca7d5ffed17c817480f3e7e2 Mon Sep 17 00:00:00 2001 From: 22quinn <33176974+22quinn@users.noreply.github.com> Date: Tue, 10 Jun 2025 03:52:01 -0700 Subject: [PATCH 40/63] Fix v1 tests Signed-off-by: 22quinn <33176974+22quinn@users.noreply.github.com> --- tests/v1/worker/test_gpu_input_batch.py | 3 ++- vllm/v1/worker/gpu_input_batch.py | 16 +++++++++------- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/tests/v1/worker/test_gpu_input_batch.py b/tests/v1/worker/test_gpu_input_batch.py index 20d85bb85377..9e5e06cdc1f5 100644 --- a/tests/v1/worker/test_gpu_input_batch.py +++ b/tests/v1/worker/test_gpu_input_batch.py @@ -10,6 +10,7 @@ from vllm.sampling_params import SamplingParams from vllm.utils import is_pin_memory_available, make_tensor_with_pad +from vllm.v1.pool.metadata import PoolingMetadata from vllm.v1.sample.metadata import SamplingMetadata from vllm.v1.worker.block_table import BlockTable, MultiGroupBlockTable from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch @@ -46,7 +47,7 @@ def _compare_objs(obj1, obj2): for a_i, b_i in zip(a.block_tables, b.block_tables): _compare_objs(a_i, b_i) is_same = True - elif isinstance(a, (BlockTable, SamplingMetadata)): + elif isinstance(a, (BlockTable, SamplingMetadata, PoolingMetadata)): _compare_objs(a, b) is_same = True # if we make it here must be same elif a == b: diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py index 4f2fa70b7e02..f9be13993bfe 100644 --- a/vllm/v1/worker/gpu_input_batch.py +++ b/vllm/v1/worker/gpu_input_batch.py @@ -611,13 +611,15 @@ def _make_sampling_metadata(self) -> SamplingMetadata: @property def pooling_metadata(self) -> PoolingMetadata: - - # Note, for now this assumes that all request in the batch - # are either 
sampling or pooling requests - assert len(self.req_ids) == len(self.pooling_params) - pooling_params = [ - self.pooling_params[req_id] for req_id in self.req_ids - ] + if len(self.pooling_params) == 0: + pooling_params = [] + else: + # Note, for now this assumes that all request in the batch + # are either sampling or pooling requests + assert len(self.req_ids) == len(self.pooling_params) + pooling_params = [ + self.pooling_params[req_id] for req_id in self.req_ids + ] return PoolingMetadata( prompt_lens=torch.from_numpy( From 65b8377c82dd055170a85731b52e60f5f59afd18 Mon Sep 17 00:00:00 2001 From: Max de Bayser Date: Tue, 10 Jun 2025 09:21:08 -0300 Subject: [PATCH 41/63] fix test Signed-off-by: Max de Bayser --- tests/v1/worker/test_gpu_input_batch.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/v1/worker/test_gpu_input_batch.py b/tests/v1/worker/test_gpu_input_batch.py index 72547e86b0e9..70dd7e3c1be4 100644 --- a/tests/v1/worker/test_gpu_input_batch.py +++ b/tests/v1/worker/test_gpu_input_batch.py @@ -201,6 +201,7 @@ def _construct_cached_request_state(req_id_suffix: int): req_id=f"req_id_{req_id_suffix}", prompt_token_ids=prompt_token_ids, sampling_params=_create_sampling_params(), + pooling_params=None, mm_inputs=[], mm_positions=[], block_ids=[[]], From 4ee822aa95e9241955e89d8fa239eb2863bf94e2 Mon Sep 17 00:00:00 2001 From: Max de Bayser Date: Tue, 10 Jun 2025 10:12:21 -0300 Subject: [PATCH 42/63] reduce max_model_len to fit in test gpu Signed-off-by: Max de Bayser --- tests/models/language/pooling/test_classification.py | 2 +- tests/models/language/pooling/test_embedding.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/models/language/pooling/test_classification.py b/tests/models/language/pooling/test_classification.py index 87c48bc981ce..77df6d16a367 100644 --- a/tests/models/language/pooling/test_classification.py +++ b/tests/models/language/pooling/test_classification.py @@ -37,7 +37,7 @@ def test_models( # switch to use ROCm CK FA backend monkeypatch.setenv("VLLM_USE_TRITON_FLASH_ATTN", "False") - with vllm_runner(model, dtype=dtype) as vllm_model: + with vllm_runner(model, max_model_len=512, dtype=dtype) as vllm_model: vllm_outputs = vllm_model.classify(example_prompts) with hf_runner(model, diff --git a/tests/models/language/pooling/test_embedding.py b/tests/models/language/pooling/test_embedding.py index 77207fcbcca3..e29b4f6e8bec 100644 --- a/tests/models/language/pooling/test_embedding.py +++ b/tests/models/language/pooling/test_embedding.py @@ -82,7 +82,7 @@ def test_models( with vllm_runner(model, task="embed", - max_model_len=None, + max_model_len=512, **vllm_extra_kwargs) as vllm_model: vllm_outputs = vllm_model.encode(example_prompts) From 72427310f20201d8f1937040f75b6a5839dc0525 Mon Sep 17 00:00:00 2001 From: Max de Bayser Date: Tue, 10 Jun 2025 12:06:19 -0300 Subject: [PATCH 43/63] fix test Signed-off-by: Max de Bayser --- tests/v1/kv_connector/unit/utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/v1/kv_connector/unit/utils.py b/tests/v1/kv_connector/unit/utils.py index 4a9e3a7ad807..b0fdc8634a07 100644 --- a/tests/v1/kv_connector/unit/utils.py +++ b/tests/v1/kv_connector/unit/utils.py @@ -150,6 +150,7 @@ def create_request( request_id=f"id-{request_id}", prompt_token_ids=prompt_token_ids, sampling_params=sampling_params, + pooling_params=None, multi_modal_inputs=None, multi_modal_placeholders=None, multi_modal_hashes=None, From a4f460b63984a836ebddb924a26d525d2017ffd6 Mon Sep 17 00:00:00 2001 From: Max de Bayser 
Date: Tue, 10 Jun 2025 13:46:45 -0300 Subject: [PATCH 44/63] fix test Signed-off-by: Max de Bayser --- tests/v1/kv_connector/unit/utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/v1/kv_connector/unit/utils.py b/tests/v1/kv_connector/unit/utils.py index b0fdc8634a07..61f59f35f75b 100644 --- a/tests/v1/kv_connector/unit/utils.py +++ b/tests/v1/kv_connector/unit/utils.py @@ -184,6 +184,7 @@ def create_model_runner_output( spec_token_ids=None, logprobs=None, prompt_logprobs_dict={}, + pooler_output=None, finished_sending=finished_sending, finished_recving=finished_recving, ) From 17f61778fb574d2c4461b17bdd29d6d0820f6271 Mon Sep 17 00:00:00 2001 From: Max de Bayser Date: Thu, 12 Jun 2025 14:20:13 -0300 Subject: [PATCH 45/63] fix test Signed-off-by: Max de Bayser --- tests/v1/engine/test_fast_incdec_prefix_err.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/v1/engine/test_fast_incdec_prefix_err.py b/tests/v1/engine/test_fast_incdec_prefix_err.py index 5c844e0e7095..f028b4ab1d73 100644 --- a/tests/v1/engine/test_fast_incdec_prefix_err.py +++ b/tests/v1/engine/test_fast_incdec_prefix_err.py @@ -33,6 +33,7 @@ def test_fast_inc_detok_invalid_utf8_err_case(): None, params, None, + None, 0.0, None, cache_salt=None, From 74d73ccbc2f11b9e71feb1ea673ad3807a69a53d Mon Sep 17 00:00:00 2001 From: Max de Bayser Date: Thu, 12 Jun 2025 16:23:57 -0300 Subject: [PATCH 46/63] use torch.split Signed-off-by: Max de Bayser --- vllm/v1/worker/gpu_model_runner.py | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index cb974bc304d8..27b195587e58 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1304,12 +1304,8 @@ def execute_model( "Either all or none of the requests in" \ " a batch must be pooling request" - offset = 0 - extracted_hidden_states = list[torch.Tensor]() - for seq_len in num_scheduled_tokens_np: - extracted_hidden_states.append( - hidden_states[offset:offset + seq_len]) - offset += seq_len + extracted_hidden_states = torch.split( + hidden_states, num_scheduled_tokens_np.tolist()) pooling_metadata = self.input_batch.pooling_metadata @@ -2005,11 +2001,8 @@ def _dummy_pooler_run( num_reqs = num_scheduled_tokens.shape[0] - offset = 0 - hidden_states_list = list[torch.Tensor]() - for seq_len in num_scheduled_tokens: - hidden_states_list.append(hidden_states[offset:offset + seq_len]) - offset += seq_len + hidden_states_list = torch.split(hidden_states, + num_scheduled_tokens.tolist()) req_num_tokens = num_tokens // num_reqs From e6a66dcea06e9a445855cc9136c10814ea7c1f4e Mon Sep 17 00:00:00 2001 From: Max de Bayser Date: Thu, 12 Jun 2025 17:03:44 -0300 Subject: [PATCH 47/63] enable cuda graphs Signed-off-by: Max de Bayser --- vllm/config.py | 3 --- vllm/v1/worker/gpu_model_runner.py | 25 +++++++++++++------------ 2 files changed, 13 insertions(+), 15 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index 84390f13a52e..952ab0d915c9 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -715,9 +715,6 @@ def _init_pooler_config(self) -> Optional["PoolerConfig"]: if isinstance(self.override_pooler_config, dict): self.override_pooler_config = PoolerConfig( **self.override_pooler_config) - logger.warning("CUDA graph is not supported for pooling yet, " - "fallback to the eager mode.") - self.enforce_eager = True pooler_config = self.override_pooler_config or PoolerConfig() diff --git a/vllm/v1/worker/gpu_model_runner.py 
b/vllm/v1/worker/gpu_model_runner.py index 27b195587e58..d4ed0b297f36 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1813,7 +1813,7 @@ def _dummy_run( self, num_tokens: int, skip_attn: bool = True, - ) -> tuple[torch.Tensor, torch.Tensor, int, np.ndarray]: + ) -> tuple[torch.Tensor, torch.Tensor]: # Padding for DP num_pad, num_tokens_across_dp = self.get_dp_padding(num_tokens) @@ -1910,8 +1910,7 @@ def _dummy_run( self.drafter.dummy_run(num_tokens) logit_indices = np.cumsum(num_scheduled_tokens) - 1 - return hidden_states, hidden_states[ - logit_indices], num_reqs, num_scheduled_tokens + return hidden_states, hidden_states[logit_indices] @torch.inference_mode() def _dummy_sampler_run( @@ -1993,16 +1992,20 @@ def _dummy_sampler_run( @torch.inference_mode() def _dummy_pooler_run( self, - num_tokens: int, - num_reqs: int, hidden_states: torch.Tensor, - num_scheduled_tokens: np.ndarray, ) -> torch.Tensor: - num_reqs = num_scheduled_tokens.shape[0] + num_tokens = hidden_states.shape[0] + max_num_reqs = self.scheduler_config.max_num_seqs + num_reqs = min(num_tokens, max_num_reqs) + min_tokens_per_req = num_tokens // num_reqs + num_scheduled_tokens_list = [min_tokens_per_req] * num_reqs + num_scheduled_tokens_list[-1] += num_tokens % num_reqs + assert sum(num_scheduled_tokens_list) == num_tokens + assert len(num_scheduled_tokens_list) == num_reqs hidden_states_list = torch.split(hidden_states, - num_scheduled_tokens.tolist()) + num_scheduled_tokens_list) req_num_tokens = num_tokens // num_reqs @@ -2098,13 +2101,11 @@ def profile_run(self) -> None: # Cache the dummy encoder outputs. self.encoder_cache["tmp"] = dict(enumerate(dummy_encoder_outputs)) - hidden_states, last_hidden_states, num_reqs, num_scheduled_tokens \ + hidden_states, last_hidden_states \ = self._dummy_run(self.max_num_tokens) if get_pp_group().is_last_rank: if self.is_pooling_model: - output = self._dummy_pooler_run(self.max_num_tokens, num_reqs, - hidden_states, - num_scheduled_tokens) + output = self._dummy_pooler_run(hidden_states) else: output = self._dummy_sampler_run(last_hidden_states) else: From 4cca7747fe56f1979c96f59d5a4fb5c1bbc1cad6 Mon Sep 17 00:00:00 2001 From: Max de Bayser Date: Thu, 12 Jun 2025 17:13:41 -0300 Subject: [PATCH 48/63] fix unecessary config.py changes Signed-off-by: Max de Bayser --- vllm/config.py | 17 +++-------------- 1 file changed, 3 insertions(+), 14 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index 952ab0d915c9..fff1ccfbcd7d 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -4477,13 +4477,11 @@ def __post_init__(self): "Disabling `torch.compile`.") self.compilation_config.level = CompilationLevel.NO_COMPILATION - disable_cascade_reasons: list[str] = [] - if self.compilation_config.full_cuda_graph and \ not self.model_config.disable_cascade_attn: - disable_cascade_reasons.append( - "full_cuda_graph is not supported with " - "cascade attention. Disabling cascade attention.") + logger.info("full_cuda_graph is not supported with " + "cascade attention. 
Disabling cascade attention.") + self.model_config.disable_cascade_attn = True self.cache_config.enable_prefix_caching = False disable_chunked_prefill_reasons: list[str] = [] @@ -4495,13 +4493,9 @@ def __post_init__(self): "Only \"last\" pooling supports chunked " "prefill and prefix caching; disabling both.") - disable_cascade_reasons.append( - "Loaded model for pooling; disabling cascade attention.") - if disable_chunked_prefill_reasons: for reason in disable_chunked_prefill_reasons: logger.info(reason) - self.scheduler_config.enable_chunked_prefill = False self.scheduler_config.chunked_prefill_enabled = False self.scheduler_config.long_prefill_token_threshold = 0 self.scheduler_config.max_num_batched_tokens = max( @@ -4511,11 +4505,6 @@ def __post_init__(self): if self.cache_config is not None: self.cache_config.enable_prefix_caching = False - if disable_cascade_reasons: - for reason in disable_cascade_reasons: - logger.info(reason) - self.model_config.disable_cascade_attn = True - if (self.kv_events_config is not None and self.kv_events_config.enable_kv_cache_events and not self.cache_config.enable_prefix_caching): From 8ef19825552bf1de96a49e6248c9a41784d4d268 Mon Sep 17 00:00:00 2001 From: Max de Bayser Date: Thu, 12 Jun 2025 17:21:05 -0300 Subject: [PATCH 49/63] fix error message Signed-off-by: Max de Bayser --- vllm/v1/request.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/v1/request.py b/vllm/v1/request.py index c95e4371c684..d536479952d1 100644 --- a/vllm/v1/request.py +++ b/vllm/v1/request.py @@ -65,7 +65,7 @@ def __init__( sampling_params.extra_args.get("kv_transfer_params") else: raise ValueError( - "sampling_params and pooling_params can't both be set") + "sampling_params and pooling_params can't both be unset") self.prompt_token_ids = prompt_token_ids self.num_prompt_tokens = len(self.prompt_token_ids) From 28d00d171bbdaa4886bd92a86578d9ffe0b1ea23 Mon Sep 17 00:00:00 2001 From: Max de Bayser Date: Thu, 12 Jun 2025 17:21:59 -0300 Subject: [PATCH 50/63] remove unused import Signed-off-by: Max de Bayser --- vllm/v1/engine/core.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 3303e46a8d70..d117e1db4ce4 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -16,7 +16,6 @@ import msgspec import zmq -import zmq.asyncio from vllm.config import ParallelConfig, VllmConfig from vllm.distributed import stateless_destroy_torch_distributed_process_group From e634f60bad49a43bd4566ae7f9e21efd72526fb8 Mon Sep 17 00:00:00 2001 From: Max de Bayser Date: Thu, 12 Jun 2025 17:25:56 -0300 Subject: [PATCH 51/63] fix docstring Signed-off-by: Max de Bayser --- vllm/v1/engine/async_llm.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 660afa71c445..998c4c5ea3cf 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -455,8 +455,7 @@ async def encode( Main function called by the API server to kick off a request * 1) Making an AsyncStream corresponding to the Request. * 2) Processing the Input. - * 3) Adding the Request to the Detokenizer. - * 4) Adding the Request to the EngineCore (separate process). + * 3) Adding the Request to the EngineCore (separate process). 
A separate output_handler loop runs in a background AsyncIO task, pulling outputs from EngineCore and putting them into the From 053475cff21d47368d428d9e6f37c2539ede4093 Mon Sep 17 00:00:00 2001 From: Max de Bayser Date: Thu, 12 Jun 2025 17:28:53 -0300 Subject: [PATCH 52/63] revert unnecessary code changes Signed-off-by: Max de Bayser --- vllm/v1/core/sched/utils.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/vllm/v1/core/sched/utils.py b/vllm/v1/core/sched/utils.py index 01c0ef402ae2..737694f8a304 100644 --- a/vllm/v1/core/sched/utils.py +++ b/vllm/v1/core/sched/utils.py @@ -19,15 +19,16 @@ def check_stop(request: Request, request.status = RequestStatus.FINISHED_STOPPED return True - if (sampling_params := request.sampling_params) is not None: - last_token_id = request.output_token_ids[-1] - if (not sampling_params.ignore_eos - and last_token_id == request.eos_token_id): - request.status = RequestStatus.FINISHED_STOPPED - return True + sampling_params = request.sampling_params + assert sampling_params is not None + last_token_id = request.output_token_ids[-1] + if (not sampling_params.ignore_eos + and last_token_id == request.eos_token_id): + request.status = RequestStatus.FINISHED_STOPPED + return True - if last_token_id in (sampling_params.stop_token_ids or ()): - request.status = RequestStatus.FINISHED_STOPPED - request.stop_reason = last_token_id - return True + if last_token_id in (sampling_params.stop_token_ids or ()): + request.status = RequestStatus.FINISHED_STOPPED + request.stop_reason = last_token_id + return True return False From 6228f644486f8e1067456a0fdb2c62b60c59d861 Mon Sep 17 00:00:00 2001 From: Max de Bayser Date: Thu, 12 Jun 2025 17:29:51 -0300 Subject: [PATCH 53/63] remove debug prints Signed-off-by: Max de Bayser --- tests/models/language/generation/test_common.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/models/language/generation/test_common.py b/tests/models/language/generation/test_common.py index bfcfef04db47..f656f90c4bd3 100644 --- a/tests/models/language/generation/test_common.py +++ b/tests/models/language/generation/test_common.py @@ -146,8 +146,6 @@ def test_models(hf_runner, vllm_runner, example_prompts, model: str, vllm_outputs_from_embeds = vllm_model.generate_greedy_logprobs( prompt_embeds, max_tokens, num_logprobs) - print(f"{hf_outputs=}") - print(f"{vllm_outputs=}") check_logprobs_close( outputs_0_lst=hf_outputs, outputs_1_lst=vllm_outputs, From 42c802aa62c98861dbe7c15d77ddfe03fb465d3c Mon Sep 17 00:00:00 2001 From: Max de Bayser Date: Thu, 12 Jun 2025 18:05:21 -0300 Subject: [PATCH 54/63] fix refactoring bug Signed-off-by: Max de Bayser --- vllm/v1/worker/gpu_model_runner.py | 9 +++++---- vllm/v1/worker/gpu_worker.py | 7 ++----- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index d4ed0b297f36..0f33ed958644 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1304,8 +1304,9 @@ def execute_model( "Either all or none of the requests in" \ " a batch must be pooling request" - extracted_hidden_states = torch.split( - hidden_states, num_scheduled_tokens_np.tolist()) + extracted_hidden_states = list( + torch.split(hidden_states, + num_scheduled_tokens_np.tolist())) pooling_metadata = self.input_batch.pooling_metadata @@ -2004,8 +2005,8 @@ def _dummy_pooler_run( assert sum(num_scheduled_tokens_list) == num_tokens assert len(num_scheduled_tokens_list) == num_reqs - 
hidden_states_list = torch.split(hidden_states, - num_scheduled_tokens_list) + hidden_states_list = list( + torch.split(hidden_states, num_scheduled_tokens_list)) req_num_tokens = num_tokens // num_reqs diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index e79d1488cdfb..bb7ecada463f 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -269,13 +269,10 @@ def compile_or_warm_up_model(self) -> None: max_num_reqs = min(self.scheduler_config.max_num_seqs, self.scheduler_config.max_num_batched_tokens) - hidden_states, last_hidden_states, \ - num_reqs, num_scheduled_tokens = \ + hidden_states, last_hidden_states = \ self.model_runner._dummy_run(num_tokens=max_num_reqs) if self.model_runner.is_pooling_model: - self.model_runner._dummy_pooler_run(max_num_reqs, num_reqs, - hidden_states, - num_scheduled_tokens) + self.model_runner._dummy_pooler_run(hidden_states) else: self.model_runner._dummy_sampler_run( hidden_states=last_hidden_states) From f771a197b8e9149a31a579e30ba63b1e21743ef4 Mon Sep 17 00:00:00 2001 From: Max de Bayser Date: Thu, 12 Jun 2025 18:33:02 -0300 Subject: [PATCH 55/63] fix refactoring bug Signed-off-by: Max de Bayser --- vllm/v1/core/sched/utils.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/vllm/v1/core/sched/utils.py b/vllm/v1/core/sched/utils.py index 737694f8a304..42ec95091f96 100644 --- a/vllm/v1/core/sched/utils.py +++ b/vllm/v1/core/sched/utils.py @@ -15,9 +15,11 @@ def check_stop(request: Request, request.status = RequestStatus.FINISHED_LENGTH_CAPPED return True - if request.pooling_params and pooler_output is not None: - request.status = RequestStatus.FINISHED_STOPPED - return True + if request.pooling_params: + if pooler_output is not None: + request.status = RequestStatus.FINISHED_STOPPED + return True + return False sampling_params = request.sampling_params assert sampling_params is not None From 02c47adfecb1ed86fb3acdd26dee94ec7267a346 Mon Sep 17 00:00:00 2001 From: Max de Bayser Date: Thu, 12 Jun 2025 21:43:11 -0300 Subject: [PATCH 56/63] Fix default chunked prefill for pooling models Signed-off-by: Max de Bayser --- vllm/engine/arg_utils.py | 13 +++++++++---- vllm/v1/core/sched/scheduler.py | 26 ++++++++++++++++++++------ 2 files changed, 29 insertions(+), 10 deletions(-) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index a2e64fdcc961..59a055138887 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1040,7 +1040,7 @@ def create_engine_config( # Set default arguments for V0 or V1 Engine. if use_v1: - self._set_default_args_v1(usage_context) + self._set_default_args_v1(usage_context, model_config) else: self._set_default_args_v0(model_config) @@ -1515,11 +1515,16 @@ def _set_default_args_v0(self, model_config: ModelConfig) -> None: if self.max_num_seqs is None: self.max_num_seqs = 256 - def _set_default_args_v1(self, usage_context: UsageContext) -> None: + def _set_default_args_v1(self, usage_context: UsageContext, + model_config: ModelConfig) -> None: """Set Default Arguments for V1 Engine.""" - # V1 always uses chunked prefills. - self.enable_chunked_prefill = True + # V1 always uses chunked prefills for non-pooling tasks. + # For pooling tasks the default is False + if model_config.runner_type != "pooling": + self.enable_chunked_prefill = True + elif self.enable_chunked_prefill is None: + self.enable_chunked_prefill = False # V1 enables prefix caching by default. 
if self.enable_prefix_caching is None: diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index d23704b2e872..d86928f61183 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -203,13 +203,18 @@ def schedule(self) -> SchedulerOutput: num_new_tokens): num_new_tokens = ( self.scheduler_config.long_prefill_token_threshold) - num_new_tokens = min(num_new_tokens, token_budget) - # Make sure the input position does not exceed the max model len. - # This is necessary when using spec decoding. - num_new_tokens = min( - num_new_tokens, - self.max_model_len - request.num_computed_tokens) + remaining_len = self.max_model_len - request.num_computed_tokens + + if not self.scheduler_config.chunked_prefill_enabled: + if num_new_tokens > min(token_budget, remaining_len): + num_new_tokens = 0 + else: + num_new_tokens = min(num_new_tokens, token_budget) + + # Make sure the input position does not exceed the max model + # len. This is necessary when using spec decoding. + num_new_tokens = min(num_new_tokens, remaining_len) # Schedule encoder inputs. encoder_inputs_to_schedule = None @@ -402,6 +407,15 @@ def schedule(self) -> SchedulerOutput: < num_new_tokens): num_new_tokens = ( self.scheduler_config.long_prefill_token_threshold) + + # chunked prefill has to be enabled explicitly to allow + # pooling requests to be chunked + if not self.scheduler_config.chunked_prefill_enabled and \ + num_new_tokens > token_budget: + self.waiting.popleft() + skipped_waiting_requests.appendleft(request) + continue + num_new_tokens = min(num_new_tokens, token_budget) assert num_new_tokens > 0 From c5c0d973b206a060e014ec52f1a48970d209c5cc Mon Sep 17 00:00:00 2001 From: Max de Bayser Date: Thu, 12 Jun 2025 21:53:29 -0300 Subject: [PATCH 57/63] Revert handling of case that can never happen Signed-off-by: Max de Bayser --- vllm/v1/core/sched/scheduler.py | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index d86928f61183..e22dc7001009 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -203,18 +203,13 @@ def schedule(self) -> SchedulerOutput: num_new_tokens): num_new_tokens = ( self.scheduler_config.long_prefill_token_threshold) + num_new_tokens = min(num_new_tokens, token_budget) - remaining_len = self.max_model_len - request.num_computed_tokens - - if not self.scheduler_config.chunked_prefill_enabled: - if num_new_tokens > min(token_budget, remaining_len): - num_new_tokens = 0 - else: - num_new_tokens = min(num_new_tokens, token_budget) - - # Make sure the input position does not exceed the max model - # len. This is necessary when using spec decoding. - num_new_tokens = min(num_new_tokens, remaining_len) + # Make sure the input position does not exceed the max model len. + # This is necessary when using spec decoding. + num_new_tokens = min( + num_new_tokens, + self.max_model_len - request.num_computed_tokens) # Schedule encoder inputs. 
encoder_inputs_to_schedule = None From acfc9cc9fffe7d254b496bafa3595a23e48420e7 Mon Sep 17 00:00:00 2001 From: Max de Bayser Date: Fri, 13 Jun 2025 10:16:47 -0300 Subject: [PATCH 58/63] fix small bug Signed-off-by: Max de Bayser --- vllm/v1/worker/gpu_model_runner.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 4121aaa597af..38b1fc6d119f 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1311,8 +1311,9 @@ def execute_model( "Either all or none of the requests in" \ " a batch must be pooling request" + total_len = num_scheduled_tokens_np.sum() extracted_hidden_states = list( - torch.split(hidden_states, + torch.split(hidden_states[:total_len], num_scheduled_tokens_np.tolist())) pooling_metadata = self.input_batch.pooling_metadata From 225b808c827c5d46b97a087e46e2c1846f29bfbc Mon Sep 17 00:00:00 2001 From: Max de Bayser Date: Fri, 13 Jun 2025 11:17:09 -0300 Subject: [PATCH 59/63] fix small bugs Signed-off-by: Max de Bayser --- vllm/engine/arg_utils.py | 16 ++++++++++------ vllm/model_executor/models/qwen3.py | 7 ++++++- 2 files changed, 16 insertions(+), 7 deletions(-) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 59a055138887..170d1638e37c 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1519,16 +1519,20 @@ def _set_default_args_v1(self, usage_context: UsageContext, model_config: ModelConfig) -> None: """Set Default Arguments for V1 Engine.""" - # V1 always uses chunked prefills for non-pooling tasks. + # V1 always uses chunked prefills and prefix caching + # for non-pooling tasks. # For pooling tasks the default is False if model_config.runner_type != "pooling": self.enable_chunked_prefill = True - elif self.enable_chunked_prefill is None: - self.enable_chunked_prefill = False - - # V1 enables prefix caching by default. - if self.enable_prefix_caching is None: self.enable_prefix_caching = True + else: + if self.enable_chunked_prefill is None: + self.enable_chunked_prefill = False + if self.enable_prefix_caching is None: + self.enable_prefix_caching = False + + if not self.enable_chunked_prefill: + self.max_num_batched_tokens = model_config.max_model_len # V1 should use the new scheduler by default. 
# Swap it only if this arg is set to the original V0 default diff --git a/vllm/model_executor/models/qwen3.py b/vllm/model_executor/models/qwen3.py index bad0f6b1ffb7..216c1f1c7ff7 100644 --- a/vllm/model_executor/models/qwen3.py +++ b/vllm/model_executor/models/qwen3.py @@ -375,7 +375,12 @@ def pooler( ) -> Optional[PoolerOutput]: hidden_states = self._pooler.extract_states(hidden_states, pooling_metadata) - logits, _ = self.score(hidden_states) + + if isinstance(hidden_states, list): + logits = [self.score(state)[0] for state in hidden_states] + else: + logits, _ = self.score(hidden_states) + pooled_data = self._pooler.head(logits, pooling_metadata) pooled_outputs = [ self._pooler.build_output(data.squeeze(-1)) for data in pooled_data From 2b86c130de296cd42b1581ba0d6c50e7e7b19f54 Mon Sep 17 00:00:00 2001 From: Max de Bayser Date: Fri, 13 Jun 2025 13:42:50 -0300 Subject: [PATCH 60/63] fix silly mistake Signed-off-by: Max de Bayser --- vllm/engine/arg_utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 170d1638e37c..49633435642e 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1524,7 +1524,8 @@ def _set_default_args_v1(self, usage_context: UsageContext, # For pooling tasks the default is False if model_config.runner_type != "pooling": self.enable_chunked_prefill = True - self.enable_prefix_caching = True + if self.enable_prefix_caching is None: + self.enable_prefix_caching = True else: if self.enable_chunked_prefill is None: self.enable_chunked_prefill = False From 2983252f92714f5bc26e9823608054edac21e028 Mon Sep 17 00:00:00 2001 From: Max de Bayser Date: Fri, 13 Jun 2025 13:44:02 -0300 Subject: [PATCH 61/63] reduce memory usage for small ci gpus Signed-off-by: Max de Bayser --- examples/offline_inference/basic/embed.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/examples/offline_inference/basic/embed.py b/examples/offline_inference/basic/embed.py index fc5ca23787be..1114033d5cea 100644 --- a/examples/offline_inference/basic/embed.py +++ b/examples/offline_inference/basic/embed.py @@ -12,7 +12,10 @@ def parse_args(): parser = EngineArgs.add_cli_args(parser) # Set example specific arguments parser.set_defaults( - model="intfloat/e5-mistral-7b-instruct", task="embed", enforce_eager=True + model="intfloat/e5-mistral-7b-instruct", + task="embed", + enforce_eager=True, + max_model_len=1024, ) return parser.parse_args() From 878d56a9d9d35ad0b4ef7a427004cd341a446e40 Mon Sep 17 00:00:00 2001 From: Max de Bayser Date: Sat, 14 Jun 2025 10:45:06 -0300 Subject: [PATCH 62/63] enable chunked prefill by default for models that support it Signed-off-by: Max de Bayser --- vllm/engine/arg_utils.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 56db7d98b734..e25f6f52e4d4 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1527,10 +1527,18 @@ def _set_default_args_v1(self, usage_context: UsageContext, if self.enable_prefix_caching is None: self.enable_prefix_caching = True else: + + pooling_type = model_config.pooler_config.pooling_type + + # TODO: when encoder models are supported we'll have to + # check for causal attention here. 
+ incremental_prefill_supported = (pooling_type is not None and \ + pooling_type.lower() == "last") + if self.enable_chunked_prefill is None: - self.enable_chunked_prefill = False + self.enable_chunked_prefill = incremental_prefill_supported if self.enable_prefix_caching is None: - self.enable_prefix_caching = False + self.enable_prefix_caching = incremental_prefill_supported if not self.enable_chunked_prefill: self.max_num_batched_tokens = model_config.max_model_len From bc0219d6fb645927fddbb2c1e61be31b29788b85 Mon Sep 17 00:00:00 2001 From: Max de Bayser Date: Mon, 16 Jun 2025 16:00:30 -0300 Subject: [PATCH 63/63] address review comments Signed-off-by: Max de Bayser --- vllm/engine/arg_utils.py | 9 +++- vllm/v1/worker/gpu_model_runner.py | 86 +++++++++++++++++------------- 2 files changed, 55 insertions(+), 40 deletions(-) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index e25f6f52e4d4..44f14c0cab08 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1532,13 +1532,18 @@ def _set_default_args_v1(self, usage_context: UsageContext, # TODO: when encoder models are supported we'll have to # check for causal attention here. - incremental_prefill_supported = (pooling_type is not None and \ - pooling_type.lower() == "last") + incremental_prefill_supported = (pooling_type is not None and + pooling_type.lower() == "last") + + action = "Enabling" if \ + incremental_prefill_supported else "Disabling" if self.enable_chunked_prefill is None: self.enable_chunked_prefill = incremental_prefill_supported + logger.info("(%s) chunked prefill by default", action) if self.enable_prefix_caching is None: self.enable_prefix_caching = incremental_prefill_supported + logger.info("(%s) prefix caching by default", action) if not self.enable_chunked_prefill: self.max_num_batched_tokens = model_config.max_model_len diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 8cf34a7e9d2b..59a62c151e59 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1197,6 +1197,51 @@ def get_dp_padding(self, dtype=torch.int32) return max_tokens_across_dp_cpu - num_tokens, num_tokens_after_padding + def _pool( + self, + hidden_states: torch.Tensor, + num_scheduled_tokens: int, + num_scheduled_tokens_np: np.ndarray, + finished_sending: Optional[set[str]], + finished_recving: Optional[set[str]], + ) -> ModelRunnerOutput: + assert self.input_batch.num_reqs ==\ + len(self.input_batch.pooling_params), \ + "Either all or none of the requests in" \ + " a batch must be pooling request" + + extracted_hidden_states = list( + torch.split(hidden_states[:num_scheduled_tokens], + num_scheduled_tokens_np.tolist())) + + pooling_metadata = self.input_batch.pooling_metadata + + raw_pooler_output = self.model.pooler( + hidden_states=extracted_hidden_states, + pooling_metadata=pooling_metadata) + + pooler_output: list[Optional[torch.Tensor]] = [] + seq_lens = self.seq_lens[:self.input_batch.num_reqs] + for raw_output, seq_len, prompt_len in zip( + raw_pooler_output, seq_lens, pooling_metadata.prompt_lens): + + if seq_len == prompt_len: + pooler_output.append(raw_output.data.cpu()) + else: + pooler_output.append(None) + + return ModelRunnerOutput( + req_ids=self.input_batch.req_ids, + req_id_to_index=self.input_batch.req_id_to_index, + sampled_token_ids=[], + spec_token_ids=None, + logprobs=None, + prompt_logprobs_dict={}, + pooler_output=pooler_output, + finished_sending=finished_sending, + finished_recving=finished_recving, + ) + 
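The new _pool() helper added above relies on carving the flat hidden-state tensor back into per-request chunks before pooling. A minimal, standalone illustration of that torch.split pattern follows; the shapes and the last-token reduction are assumptions chosen for the example, not the actual Pooler implementation.

# Standalone sketch of the per-request split used by _pool(); example shapes only.
import torch

num_scheduled_tokens = [5, 3, 7]                     # tokens scheduled per request
hidden = torch.randn(sum(num_scheduled_tokens), 16)  # flat [total_tokens, hidden_size]

per_request = torch.split(hidden, num_scheduled_tokens)    # tuple of [len_i, hidden_size]
last_token_states = [chunk[-1] for chunk in per_request]   # what "last" pooling keeps

assert len(per_request) == 3
assert per_request[0].shape == (5, 16)
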
@torch.inference_mode() def execute_model( self, @@ -1328,44 +1373,9 @@ def execute_model( logits = None else: if self.input_batch.pooling_params: - assert self.input_batch.num_reqs ==\ - len(self.input_batch.pooling_params), \ - "Either all or none of the requests in" \ - " a batch must be pooling request" - - total_len = num_scheduled_tokens_np.sum() - extracted_hidden_states = list( - torch.split(hidden_states[:total_len], - num_scheduled_tokens_np.tolist())) - - pooling_metadata = self.input_batch.pooling_metadata - - raw_pooler_output = self.model.pooler( - hidden_states=extracted_hidden_states, - pooling_metadata=pooling_metadata) - - pooler_output: list[Optional[torch.Tensor]] = [] - seq_lens = self.seq_lens[:self.input_batch.num_reqs] - for raw_output, seq_len, prompt_len in zip( - raw_pooler_output, seq_lens, - pooling_metadata.prompt_lens): - - if seq_len == prompt_len: - pooler_output.append(raw_output.data.to("cpu")) - else: - pooler_output.append(None) - - return ModelRunnerOutput( - req_ids=self.input_batch.req_ids, - req_id_to_index=self.input_batch.req_id_to_index, - sampled_token_ids=[], - spec_token_ids=None, - logprobs=None, - prompt_logprobs_dict={}, - pooler_output=pooler_output, - finished_sending=finished_sending, - finished_recving=finished_recving, - ) + return self._pool(hidden_states, num_scheduled_tokens, + num_scheduled_tokens_np, finished_sending, + finished_recving) sample_hidden_states = hidden_states[logits_indices] logits = self.model.compute_logits(sample_hidden_states, None)
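To make the default-argument change in this series easier to follow, here is a condensed restatement of the decision applied in the arg_utils.py hunks: only "last" pooling can consume a prompt incrementally, so only then do chunked prefill and prefix caching default to on; otherwise a single scheduling step must hold the whole prompt. The function name and the dict return shape are assumptions for illustration, not the real EngineArgs API.

# Condensed, self-contained restatement of the V1 defaults for pooling models.
from typing import Optional

def v1_pooling_defaults(pooling_type: Optional[str], max_model_len: int) -> dict:
    # "last" pooling is the only type that supports incremental (chunked) prefill.
    incremental = pooling_type is not None and pooling_type.lower() == "last"
    return {
        "enable_chunked_prefill": incremental,
        "enable_prefix_caching": incremental,
        # Without chunked prefill the whole prompt must fit into one batch.
        "max_num_batched_tokens": None if incremental else max_model_len,
    }

assert v1_pooling_defaults("last", 8192)["enable_chunked_prefill"] is True
assert v1_pooling_defaults("cls", 512)["max_num_batched_tokens"] == 512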