
Commit 3cd2474

Address comments
Signed-off-by: Yong Hoon Shin <yhshin@meta.com>
1 parent 55ddaa0 commit 3cd2474

File tree: 5 files changed (+24, -32 lines)

tests/v1/e2e/test_kv_sharing_skip_prefill.py

Lines changed: 13 additions & 15 deletions
@@ -13,7 +13,7 @@

 from vllm import LLM, SamplingParams
 from vllm.compilation.backends import set_model_tag
-from vllm.compilation.decorators import (skip_torch_compile,
+from vllm.compilation.decorators import (ignore_torch_compile,
                                          support_torch_compile)
 from vllm.config import (CacheConfig, CompilationConfig, CompilationLevel,
                          VllmConfig)

@@ -161,7 +161,7 @@ def forward(
         return hidden_states, residual


-@skip_torch_compile
+@ignore_torch_compile
 class Qwen2ModelWithKVSharing(Qwen2Model):

     def __init__(self,

@@ -193,18 +193,17 @@ def __init__(self,
         )

         # Pre-allocate static buffers for CUDA graph
-        self.max_num_tokens = \
-            vllm_config.scheduler_config.max_num_batched_tokens
-        self.dtype = vllm_config.model_config.dtype
-        self.device = next(self.parameters()).device
-        self.hidden_size = vllm_config.model_config.get_hidden_size()
-        self.residual = torch.zeros((self.max_num_tokens, self.hidden_size),
-                                    dtype=self.dtype,
-                                    device=self.device)
+        max_num_tokens = vllm_config.scheduler_config.max_num_batched_tokens
+        dtype = vllm_config.model_config.dtype
+        device = next(self.parameters()).device
+        hidden_size = vllm_config.model_config.get_hidden_size()
+        self.residual = torch.zeros((max_num_tokens, hidden_size),
+                                    dtype=dtype,
+                                    device=device)
         self.hidden_states = torch.zeros(
-            (self.max_num_tokens, self.hidden_size),
-            dtype=self.dtype,
-            device=self.device)
+            (max_num_tokens, hidden_size),
+            dtype=dtype,
+            device=device)

     def forward(
         self,

@@ -355,8 +354,7 @@ def test_kv_sharing_skip_prefill(
     sampling_params = SamplingParams(temperature=0.0, max_tokens=100)
     compilation_config = CompilationConfig(
         level=CompilationLevel.PIECEWISE
-        if not enforce_eager else CompilationLevel.NO_COMPILATION,
-        cudagraph_share_memory_pool=False)
+        if not enforce_eager else CompilationLevel.NO_COMPILATION)

     with monkeypatch.context() as m:
         m.setenv("VLLM_USE_V1", "1")

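Note on the @@ -193,18 +193,17 @@ hunk: only self.residual and self.hidden_states stay as attributes, because they back CUDA graph capture and must keep stable addresses; the sizing values become plain locals. Below is a minimal, self-contained sketch of that pre-allocate-and-reuse buffer pattern. StaticBufferModule and its sizes are illustrative stand-ins, not the actual test model:

import torch
import torch.nn as nn


class StaticBufferModule(nn.Module):
    """Illustrative only: allocate output buffers once and write into slices
    of them, mirroring the buffer handling in Qwen2ModelWithKVSharing above."""

    def __init__(self, max_num_tokens: int = 256, hidden_size: int = 64):
        super().__init__()
        self.linear = nn.Linear(hidden_size, hidden_size)
        # Allocated once; forward() only fills slices, so a captured CUDA
        # graph can replay against stable memory addresses.
        self.hidden_states = torch.zeros(max_num_tokens, hidden_size)
        self.residual = torch.zeros(max_num_tokens, hidden_size)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        num_tokens = x.shape[0]
        self.residual[:num_tokens].copy_(x)
        self.hidden_states[:num_tokens].copy_(self.linear(x))
        return self.hidden_states[:num_tokens] + self.residual[:num_tokens]


# Two calls with different token counts reuse the same buffers.
module = StaticBufferModule()
with torch.no_grad():
    print(module(torch.randn(8, 64)).shape)   # torch.Size([8, 64])
    print(module(torch.randn(16, 64)).shape)  # torch.Size([16, 64])
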
vllm/compilation/backends.py

Lines changed: 2 additions & 5 deletions
@@ -412,11 +412,8 @@ def __init__(
         # them, e.g. backbone (default), eagle_head, etc.
         self.prefix = prefix or model_tag

-        if vllm_config.compilation_config.cudagraph_share_memory_pool:
-            global global_graph_pool
-            if global_graph_pool is None:
-                global_graph_pool = current_platform.graph_pool_handle()
-        else:
+        global global_graph_pool
+        if global_graph_pool is None:
             global_graph_pool = current_platform.graph_pool_handle()

         # TODO: in the future, if we want to use multiple

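With cudagraph_share_memory_pool removed, the backend now unconditionally lazy-initializes a single module-level graph pool. A self-contained sketch of that lazy-global pattern follows; _global_pool, make_pool, and get_global_pool are illustrative stand-ins, not vLLM APIs:

from typing import Optional

_global_pool: Optional[object] = None


def make_pool() -> object:
    # Stand-in for current_platform.graph_pool_handle(); a sentinel is enough here.
    return object()


def get_global_pool() -> object:
    """Create the shared pool on first use, then hand every caller the same handle."""
    global _global_pool
    if _global_pool is None:
        _global_pool = make_pool()
    return _global_pool


assert get_global_pool() is get_global_pool()  # all graph captures share one pool
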
vllm/compilation/decorators.py

Lines changed: 3 additions & 3 deletions
@@ -23,8 +23,8 @@
 _T = TypeVar("_T", bound=type[nn.Module])


-def skip_torch_compile(cls: _T) -> _T:
-    cls._skip_compile_vllm = True
+def ignore_torch_compile(cls: _T) -> _T:
+    cls._ignore_compile_vllm = True
     return cls


@@ -161,7 +161,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = '', **kwargs):
         self.do_not_compile = \
             vllm_config.compilation_config.level in [
                 CompilationLevel.NO_COMPILATION, CompilationLevel.DYNAMO_AS_IS
-            ] or not supports_dynamo() or getattr(self, "_skip_compile_vllm", False)
+            ] or not supports_dynamo() or getattr(self, "_ignore_compile_vllm", False)
         if self.do_not_compile:
             return
         compilation_counter.num_models_seen += 1

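After the rename, a model class opts out of vLLM's torch.compile integration via ignore_torch_compile, and the decorator does nothing more than set a class attribute that the support_torch_compile path reads back with getattr. A sketch of that mechanism in isolation; ToyModel is illustrative, only the decorator and attribute names come from this diff:

from typing import TypeVar

import torch.nn as nn

_T = TypeVar("_T", bound=type[nn.Module])


def ignore_torch_compile(cls: _T) -> _T:
    # Same shape as the decorator in vllm/compilation/decorators.py: mark the
    # class so the compile machinery skips it.
    cls._ignore_compile_vllm = True
    return cls


@ignore_torch_compile
class ToyModel(nn.Module):
    def forward(self, x):
        return x


# The flag is read back with getattr, defaulting to False for undecorated classes.
print(getattr(ToyModel, "_ignore_compile_vllm", False))   # True
print(getattr(nn.Linear, "_ignore_compile_vllm", False))  # False
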
vllm/config.py

Lines changed: 0 additions & 5 deletions
@@ -4118,11 +4118,6 @@ class CompilationConfig:
     """Sizes to capture cudagraph.
     - None (default): capture sizes are inferred from vllm config.
     - list[int]: capture sizes are specified as given."""
-    cudagraph_share_memory_pool: bool = True
-    """Whether to share a single global memory pool for each graph capture
-    When CUDA graphs are not replayed in the same order they are captured,
-    e.g. when compiling multiple modules in a model and modules take different
-    input shapes, it is unsafe to share memory across graph captures."""
     cudagraph_copy_inputs: bool = False
     """Whether to copy input tensors for
     cudagraph. If the caller can guarantee that the same input buffers

vllm/v1/worker/gpu_model_runner.py

Lines changed: 6 additions & 4 deletions
@@ -317,9 +317,11 @@ def __init__(
         # from the KV cache of `shared_kv_cache_layers[layer_name]`.
         self.shared_kv_cache_layers: dict[str, str] = {}

-        self.decode_indices = torch.zeros(self.max_num_tokens,
-                                          dtype=torch.int32,
-                                          device=self.device)
+        self.decode_indices = None
+        if self.cache_config.kv_sharing_skip_prefill:
+            self.decode_indices = torch.zeros(self.max_num_tokens,
+                                              dtype=torch.int32,
+                                              device=self.device)

     def _may_reorder_batch(self, scheduler_output: "SchedulerOutput") -> None:
         """

@@ -583,7 +585,7 @@ def _calc_decode_indices(self, logits_indices: torch.Tensor):
         """
         Pads logits_indices to align with CUDA graph capture sizes
         """
-        if not self.cache_config.kv_sharing_skip_prefill:
+        if self.decode_indices is None:
            return None

        num_decode_reqs = 0

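The runner now allocates decode_indices only when kv_sharing_skip_prefill is enabled, and _calc_decode_indices checks the buffer itself (is None) instead of re-reading the config flag. A stripped-down sketch of that optional-buffer guard; ToyRunner and its fields are illustrative, not the real GPUModelRunner:

from typing import Optional

import torch


class ToyRunner:
    """Illustrative only: allocate an optional buffer up front and let a None
    check, rather than a config flag, gate the code that uses it."""

    def __init__(self, skip_prefill: bool, max_num_tokens: int = 128):
        self.decode_indices: Optional[torch.Tensor] = None
        if skip_prefill:
            self.decode_indices = torch.zeros(max_num_tokens, dtype=torch.int32)

    def calc_decode_indices(self, logits_indices: torch.Tensor):
        if self.decode_indices is None:  # feature disabled, nothing to compute
            return None
        num = logits_indices.numel()
        self.decode_indices[:num].copy_(logits_indices.to(torch.int32))
        return self.decode_indices[:num]


print(ToyRunner(skip_prefill=False).calc_decode_indices(torch.arange(4)))  # None
print(ToyRunner(skip_prefill=True).calc_decode_indices(torch.arange(4)))   # tensor([0, 1, 2, 3], dtype=torch.int32)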