
Commit a9783c3

Add piecewise cudagraph support + refactor
Signed-off-by: Yong Hoon Shin <yhshin@meta.com>
1 parent b0a2595 commit a9783c3

13 files changed: +267 additions, -195 deletions


tests/v1/e2e/test_kv_sharing_skip_prefill.py

Lines changed: 106 additions & 38 deletions

@@ -3,15 +3,19 @@

 import gc
 from collections.abc import Iterable
-from typing import Optional, Union
+from typing import List, Optional, Union

 import pytest
 import torch
 from torch import nn
 from transformers import Qwen2Config

 from vllm import LLM, SamplingParams
-from vllm.config import CacheConfig, VllmConfig
+from vllm.compilation.backends import set_model_tag
+from vllm.compilation.decorators import (skip_torch_compile,
+                                         support_torch_compile)
+from vllm.config import (CacheConfig, CompilationConfig, CompilationLevel,
+                         VllmConfig)
 from vllm.forward_context import get_forward_context
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
@@ -52,6 +56,7 @@ def __init__(
         target_layer_idx = layer_idx % 5
         kv_sharing_target_layer_name = f"{attn_prefix}.attn".replace(
             str(layer_idx), str(target_layer_idx))
+
         self.self_attn = Qwen2Attention(
             hidden_size=self.hidden_size,
             num_heads=config.num_attention_heads,
@@ -99,8 +104,72 @@ def forward(
         return hidden_states, residual


+@support_torch_compile
+class DecoderLayerGroup(nn.Module):
+
+    def __init__(
+        self,
+        *,
+        vllm_config: VllmConfig,
+        prefix: str = "",
+        layers: List[nn.Module],
+    ):
+        super().__init__()
+        self.layers = layers
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        residual: Optional[torch.Tensor] = None,
+    ):
+        for layer in self.layers:
+            hidden_states, residual = layer(
+                positions,
+                hidden_states,
+                residual,
+            )
+        return hidden_states, residual
+
+
+@skip_torch_compile
 class Qwen2ModelWithKVSharing(Qwen2Model):

+    def __init__(self,
+                 *,
+                 vllm_config: VllmConfig,
+                 prefix: str = "",
+                 decoder_layer_type: type[
+                     nn.Module] = Qwen2DecoderLayerWithKVSharing):
+        super().__init__(
+            vllm_config=vllm_config,
+            prefix=prefix,
+            decoder_layer_type=decoder_layer_type,
+        )
+
+        with set_model_tag("first_layer_group"):
+            self.first_layer_group = DecoderLayerGroup(
+                vllm_config=vllm_config,
+                prefix=f"{prefix}.first_layer_group",
+                layers=self.layers[self.start_layer:START_KV_SHARING_LAYER],
+            )
+
+        with set_model_tag("second_layer_group"):
+            self.second_layer_group = DecoderLayerGroup(
+                vllm_config=vllm_config,
+                prefix=f"{prefix}.second_layer_group",
+                layers=self.layers[START_KV_SHARING_LAYER:self.end_layer],
+            )
+
+        # Pre-allocate static buffers for CUDA graph
+        self.max_num_tokens = vllm_config.scheduler_config.max_num_batched_tokens
+        self.dtype = vllm_config.model_config.dtype
+        self.device = next(self.parameters()).device
+        self.hidden_size = vllm_config.model_config.get_hidden_size()
+        self.residual = torch.zeros((self.max_num_tokens, self.hidden_size),
+                                    dtype=self.dtype,
+                                    device=self.device)
+
     def forward(
         self,
         input_ids: torch.Tensor,
@@ -112,46 +181,40 @@ def forward(
             hidden_states = inputs_embeds
         else:
             hidden_states = self.get_input_embeddings(input_ids)
+
         residual = None
+        first_hidden_states, first_residual = self.first_layer_group(
+            positions,
+            hidden_states,
+            residual,  # no residual, assume no pipeline parallel
+        )

         decode_indices = get_forward_context().decode_indices
         if decode_indices is None:
             decode_indices = torch.arange(positions.size(0),
                                           device=positions.device)
-
-        # Forward with full inputs up to the first layer that shares KV cache
-        for layer in self.layers[self.start_layer:START_KV_SHARING_LAYER]:
-            hidden_states, residual = layer(
-                positions,
-                hidden_states,
-                residual,
-            )
-
-        if decode_indices is not None:
-            decode_hidden_states = hidden_states[decode_indices]
-            decode_positions = positions[decode_indices]
-            decode_residual = (residual[decode_indices]
-                               if residual is not None else None)
-        else:
-            decode_hidden_states = hidden_states
-            decode_positions = positions
-            decode_residual = residual
-
-        # Optimization: forward with partial inputs only for last N layers
-        for layer in self.layers[START_KV_SHARING_LAYER:self.end_layer]:
-            decode_hidden_states, decode_residual = layer(
-                decode_positions,
-                decode_hidden_states,
-                decode_residual,
-            )
+        num_decodes = decode_indices.shape[0]
+        assert num_decodes >= 1
+        assert first_residual is not None
+
+        # CUDA graph expects static tensor addresses
+        # Copy output of first layer group to second layer group
+        self.residual[:num_decodes].copy_(first_residual[decode_indices])
+        hidden_states[:num_decodes].copy_(first_hidden_states[decode_indices])
+        positions[:num_decodes].copy_(positions[decode_indices])
+
+        second_hidden_states, second_residual = self.second_layer_group(
+            positions[:num_decodes],
+            hidden_states[:num_decodes],
+            self.residual[:num_decodes],
+        )

         # Merge results back
-        if decode_hidden_states is not None:
-            hidden_states[decode_indices] = decode_hidden_states
-            if residual is not None:
-                residual[decode_indices] = decode_residual
+        first_hidden_states[decode_indices] = second_hidden_states
+        if first_residual is not None:
+            first_residual[decode_indices] = second_residual

-        hidden_states, _ = self.norm(hidden_states, residual)
+        hidden_states, _ = self.norm(first_hidden_states, first_residual)
         return hidden_states


@@ -205,20 +268,24 @@ def load_weights(self, weights: Iterable[tuple[str,
         return loader.load_weights(weights)


-# TODO: make it work with torch.compile
 @fork_new_process_for_each_test
-@pytest.mark.parametrize("enforce_eager", [True])
+@pytest.mark.parametrize("enforce_eager", [False, True])
 def test_kv_sharing_skip_prefill(monkeypatch, enforce_eager):
     prompt = "What is the capital of France?"
     ModelRegistry.register_model("Qwen2ForCausalLM", TestQwen2ForCausalLM)
-    sampling_params = SamplingParams(temperature=0.0, max_tokens=40)
+    sampling_params = SamplingParams(temperature=0.0, max_tokens=100)
     single_prompt = [prompt]
+    compilation_config = CompilationConfig(
+        level=CompilationLevel.PIECEWISE
+        if not enforce_eager else CompilationLevel.NO_COMPILATION,
+        cudagraph_share_memory_pool=False)

     with monkeypatch.context() as m:
         m.setenv("VLLM_USE_V1", "1")

         llm = LLM(model="Qwen/Qwen2-1.5B-Instruct",
-                  enforce_eager=enforce_eager)
+                  enforce_eager=enforce_eager,
+                  compilation_config=compilation_config)
         responses = llm.generate(single_prompt, sampling_params)
         ref_output = responses[0].outputs[0].text

@@ -229,7 +296,8 @@ def test_kv_sharing_skip_prefill(monkeypatch, enforce_eager):
         m.setenv("VLLM_V1_KV_SHARING_SKIP_PREFILL", "1")

         llm = LLM(model="Qwen/Qwen2-1.5B-Instruct",
-                  enforce_eager=enforce_eager)
+                  enforce_eager=enforce_eager,
+                  compilation_config=compilation_config)
         responses = llm.generate(single_prompt, sampling_params)
         output = responses[0].outputs[0].text
         assert output == ref_output
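
The copy_() calls into pre-allocated tensors above are the heart of the piecewise-cudagraph pattern: a replayed CUDA graph reads and writes fixed memory addresses, so each step's variable-sized inputs must be staged into static buffers before replay. Below is a minimal, self-contained sketch of that pattern in plain PyTorch (not vLLM internals; names like static_in/static_out and the single Linear layer are illustrative only).

    import torch

    def main():
        assert torch.cuda.is_available(), "sketch requires a CUDA device"
        device = torch.device("cuda")
        max_tokens, hidden = 64, 16
        # Static buffers with fixed addresses, sized for the largest batch.
        static_in = torch.zeros(max_tokens, hidden, device=device)
        static_out = torch.empty_like(static_in)
        layer = torch.nn.Linear(hidden, hidden, device=device)

        # Warm up on a side stream, then capture one graph over the full buffer.
        s = torch.cuda.Stream()
        s.wait_stream(torch.cuda.current_stream())
        with torch.cuda.stream(s):
            static_out.copy_(layer(static_in))
        torch.cuda.current_stream().wait_stream(s)

        graph = torch.cuda.CUDAGraph()
        with torch.cuda.graph(graph):
            static_out.copy_(layer(static_in))

        # Per step: copy the (smaller) live batch into the static buffer,
        # replay, and read back only the valid slice -- mirroring the
        # `self.residual[:num_decodes].copy_(...)` lines in the test model.
        num_tokens = 5
        new_batch = torch.randn(num_tokens, hidden, device=device)
        static_in[:num_tokens].copy_(new_batch)
        graph.replay()
        result = static_out[:num_tokens].clone()
        print(result.shape)

    if __name__ == "__main__":
        main()

Rows beyond num_tokens still get computed during replay; they are simply never read, which is why the buffers are sized for max_num_batched_tokens up front.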

vllm/compilation/backends.py

Lines changed: 5 additions & 2 deletions

@@ -391,8 +391,11 @@ def __init__(
         # them, e.g. backbone (default), eagle_head, etc.
         self.prefix = prefix or model_tag

-        global global_graph_pool
-        if global_graph_pool is None:
+        if vllm_config.compilation_config.cudagraph_share_memory_pool:
+            global global_graph_pool
+            if global_graph_pool is None:
+                global_graph_pool = current_platform.graph_pool_handle()
+        else:
             global_graph_pool = current_platform.graph_pool_handle()

         # TODO: in the future, if we want to use multiple
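
For context on what this branch controls: CUDA graphs captured with the same pool handle share one memory pool, while separate handles keep each graph's allocations apart (useful when outputs of independently captured graphs must stay live at the same time, as with the two layer groups in the test). A minimal sketch in plain PyTorch; the capture() helper and the lambdas are illustrative, not vLLM code.

    import torch

    def capture(fn, static_input, pool):
        # Warm up on a side stream, then capture fn(static_input) using `pool`.
        s = torch.cuda.Stream()
        s.wait_stream(torch.cuda.current_stream())
        with torch.cuda.stream(s):
            fn(static_input)
        torch.cuda.current_stream().wait_stream(s)
        g = torch.cuda.CUDAGraph()
        with torch.cuda.graph(g, pool=pool):
            out = fn(static_input)
        return g, out

    if __name__ == "__main__":
        assert torch.cuda.is_available(), "sketch requires a CUDA device"
        x1 = torch.randn(8, 8, device="cuda")
        x2 = torch.randn(8, 8, device="cuda")

        # Shared pool: both graphs allocate from one pool (smaller footprint,
        # but their intermediate memory may be reused across graphs).
        shared_pool = torch.cuda.graph_pool_handle()
        g1, _ = capture(lambda t: t @ t, x1, shared_pool)
        g2, _ = capture(lambda t: t.relu(), x2, shared_pool)

        # Separate pools: each graph owns its allocations outright.
        g3, _ = capture(lambda t: t @ t, x1, torch.cuda.graph_pool_handle())
        g4, _ = capture(lambda t: t.relu(), x2, torch.cuda.graph_pool_handle())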

vllm/compilation/decorators.py

Lines changed: 8 additions & 1 deletion

@@ -23,6 +23,13 @@
 _T = TypeVar("_T", bound=type[nn.Module])


+def skip_torch_compile(cls: _T) -> _T:
+    cls._skip_compile_vllm = True
+    for base in cls.__bases__:
+        base._skip_compile_vllm = True
+    return cls
+
+
 @overload
 def support_torch_compile(
     *,
@@ -156,7 +163,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = '', **kwargs):
         self.do_not_compile = \
             vllm_config.compilation_config.level in [
                 CompilationLevel.NO_COMPILATION, CompilationLevel.DYNAMO_AS_IS
-            ] or not supports_dynamo()
+            ] or not supports_dynamo() or getattr(self, "_skip_compile_vllm", False)
         if self.do_not_compile:
             return
         compilation_counter.num_models_seen += 1
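
A toy sketch of the opt-out mechanism this hunk wires up: the decorator stamps a class attribute, and the compile-enabled __init__ later reads it back through getattr() to force do_not_compile. Class names below are illustrative stand-ins, not vLLM's.

    def skip_compile(cls):
        # Mark the class and its bases, as the diff's skip_torch_compile does.
        # Note: marking bases also opts out other subclasses of those bases.
        cls._skip_compile = True
        for base in cls.__bases__:
            base._skip_compile = True
        return cls

    class CompileEnabled:
        # Stand-in for a class wrapped by support_torch_compile.
        def __init__(self):
            self.do_not_compile = getattr(self, "_skip_compile", False)
            if self.do_not_compile:
                print(f"{type(self).__name__}: compilation skipped")
                return
            print(f"{type(self).__name__}: would compile forward()")

    @skip_compile
    class EagerOnlyModel(CompileEnabled):
        pass

    if __name__ == "__main__":
        EagerOnlyModel()  # prints "EagerOnlyModel: compilation skipped"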

vllm/config.py

Lines changed: 2 additions & 0 deletions

@@ -4029,6 +4029,8 @@ class CompilationConfig:
     """Sizes to capture cudagraph.
     - None (default): capture sizes are inferred from vllm config.
     - list[int]: capture sizes are specified as given."""
+    cudagraph_share_memory_pool: bool = True
+    """Whether to share a single global memory pool for each CUDA graph captured"""
     cudagraph_copy_inputs: bool = False
     """Whether to copy input tensors for
     cudagraph. If the caller can guarantee that the same input buffers
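
Mirroring the test added in this commit, the new flag can be set through CompilationConfig when constructing an LLM. This assumes a vLLM build that includes this commit; the flag defaults to True.

    from vllm import LLM
    from vllm.config import CompilationConfig, CompilationLevel

    compilation_config = CompilationConfig(
        level=CompilationLevel.PIECEWISE,
        cudagraph_share_memory_pool=False,  # each captured graph gets its own pool
    )

    llm = LLM(model="Qwen/Qwen2-1.5B-Instruct",
              enforce_eager=False,
              compilation_config=compilation_config)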

vllm/envs.py

Lines changed: 0 additions & 1 deletion

@@ -962,7 +962,6 @@ def get_vllm_port() -> Optional[int]:
     # models
     "VLLM_USE_NVFP4_CT_EMULATIONS":
     lambda: bool(int(os.getenv("VLLM_USE_NVFP4_CT_EMULATIONS", "0"))),
-
     "VLLM_V1_KV_SHARING_SKIP_PREFILL":
     lambda: os.environ.get("VLLM_V1_KV_SHARING_SKIP_PREFILL", "0") == "1",
 }

vllm/forward_context.py

Lines changed: 2 additions & 0 deletions

@@ -95,7 +95,9 @@ class ForwardContext:
     # set dynamically for each forward pass
     dp_metadata: Optional[DPMetadata] = None
     skip_cuda_graphs: bool = False
+
     decode_indices: Optional[torch.Tensor] = None
+    """indices used for decoding"""


 _forward_context: Optional[ForwardContext] = None
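
The test model in this commit reads the new field through the forward context and falls back to all token positions when it is unset. A trimmed sketch of that access pattern (the helper function name is illustrative; it must run inside an active forward context):

    import torch
    from vllm.forward_context import get_forward_context

    def select_decode_rows(hidden_states: torch.Tensor,
                           positions: torch.Tensor) -> torch.Tensor:
        decode_indices = get_forward_context().decode_indices
        if decode_indices is None:
            # Feature disabled: treat every token position as a decode token.
            decode_indices = torch.arange(positions.size(0),
                                          device=positions.device)
        return hidden_states[decode_indices]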

vllm/v1/attention/backends/cpu_attn.py

Lines changed: 4 additions & 17 deletions

@@ -1,6 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-from typing import Optional
-
 import numpy as np
 import torch

@@ -121,22 +119,11 @@ def reorder_batch(self, input_batch: InputBatch,

         return True

-    def build(
-        self,
-        common_prefix_len: int,
-        common_attn_metadata: CommonAttentionMetadata,
-        decode_only_common_attn_metadata: Optional[
-            CommonAttentionMetadata] = None,
-    ):
-        if decode_only_common_attn_metadata is not None:
-            raise NotImplementedError(
-                "CPU backend does not support decode-only attention yet.")
+    def build(self, common_prefix_len: int,
+              common_attn_metadata: CommonAttentionMetadata):
         num_reqs = common_attn_metadata.num_reqs
         num_actual_tokens = common_attn_metadata.num_actual_tokens
         max_query_len = common_attn_metadata.max_query_len
-        query_start_loc_np = (common_attn_metadata.query_start_loc_np
-                              if common_attn_metadata.query_start_loc_np
-                              is not None else self.runner.query_start_loc_np)

         runner = self.runner
         block_table = self.block_table
@@ -148,8 +135,8 @@ def build(
         ) if num_prompt_req < num_reqs else 0
         self.seq_start_loc_np[0] = 0
         np.cumsum(seq_lens_np, out=self.seq_start_loc_np[1:num_reqs + 1])
-        num_prefill_tokens = query_start_loc_np[num_prompt_req].item()
-        num_decode_tokens = query_start_loc_np[num_reqs].item(
+        num_prefill_tokens = runner.query_start_loc_np[num_prompt_req].item()
+        num_decode_tokens = runner.query_start_loc_np[num_reqs].item(
         ) - num_prefill_tokens
         slot_mapping = block_table.slot_mapping_cpu[:num_actual_tokens].long()
         block_table_tensor = block_table.get_device_tensor()
