
Commit d112b85

Fix wrong prefill skip attn metadata

Signed-off-by: Yong Hoon Shin <yhshin@meta.com>

Parent: 1cbd312

7 files changed, +45 -24 lines

tests/v1/e2e/test_kv_sharing_skip_prefill.py

Lines changed: 16 additions & 12 deletions
@@ -194,6 +194,7 @@ def forward(
         if decode_indices is None:
             decode_indices = torch.arange(positions.size(0),
                                           device=positions.device)
+
         num_decodes = decode_indices.shape[0]
         assert num_decodes >= 1
         assert first_residual is not None
@@ -270,12 +271,14 @@ def load_weights(self, weights: Iterable[tuple[str,


 @fork_new_process_for_each_test
-@pytest.mark.parametrize("enforce_eager", [False, True])
-def test_kv_sharing_skip_prefill(monkeypatch, enforce_eager):
-    prompt = "What is the capital of France?"
+@pytest.mark.parametrize("enforce_eager", [True, False])
+def test_kv_sharing_skip_prefill(
+    monkeypatch: pytest.MonkeyPatch,
+    enforce_eager: bool,
+):
     ModelRegistry.register_model("Qwen2ForCausalLM", TestQwen2ForCausalLM)
     sampling_params = SamplingParams(temperature=0.0, max_tokens=100)
-    single_prompt = [prompt]
+    prompts = ["What is the capital of France?"]
     compilation_config = CompilationConfig(
         level=CompilationLevel.PIECEWISE
         if not enforce_eager else CompilationLevel.NO_COMPILATION,
@@ -284,21 +287,22 @@ def test_kv_sharing_skip_prefill(monkeypatch, enforce_eager):
     with monkeypatch.context() as m:
         m.setenv("VLLM_USE_V1", "1")

-        llm = LLM(model="Qwen/Qwen2-1.5B-Instruct",
-                  enforce_eager=enforce_eager,
-                  compilation_config=compilation_config)
-        responses = llm.generate(single_prompt, sampling_params)
+        llm = LLM(
+            model="Qwen/Qwen2-1.5B-Instruct",
+            enforce_eager=enforce_eager,
+            compilation_config=compilation_config,
+        )
+        responses = llm.generate(prompts, sampling_params)
         ref_output = responses[0].outputs[0].text

         del llm
         gc.collect()
         torch.cuda.empty_cache()

-        m.setenv("VLLM_V1_KV_SHARING_SKIP_PREFILL", "1")
-
         llm = LLM(model="Qwen/Qwen2-1.5B-Instruct",
                   enforce_eager=enforce_eager,
-                  compilation_config=compilation_config)
-        responses = llm.generate(single_prompt, sampling_params)
+                  compilation_config=compilation_config,
+                  kv_sharing_skip_prefill=True)
+        responses = llm.generate(prompts, sampling_params)
         output = responses[0].outputs[0].text
         assert output == ref_output
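
The rewritten test enables the feature through the LLM constructor
(kv_sharing_skip_prefill=True) rather than the removed
VLLM_V1_KV_SHARING_SKIP_PREFILL environment variable, and checks that greedy
outputs match a reference run. Below is a condensed sketch of that A/B
pattern; the greedy_text helper is illustrative and omits the custom
TestQwen2ForCausalLM registration and the compilation config used by the real
test.

import gc

import torch
from vllm import LLM, SamplingParams


def greedy_text(prompts: list[str], **llm_kwargs) -> str:
    # Build an engine, generate deterministically, then free GPU memory.
    llm = LLM(model="Qwen/Qwen2-1.5B-Instruct",
              enforce_eager=True,
              **llm_kwargs)
    outputs = llm.generate(prompts, SamplingParams(temperature=0.0,
                                                   max_tokens=100))
    text = outputs[0].outputs[0].text
    del llm
    gc.collect()
    torch.cuda.empty_cache()
    return text


prompts = ["What is the capital of France?"]
ref_output = greedy_text(prompts)                            # baseline run
output = greedy_text(prompts, kv_sharing_skip_prefill=True)  # prefill skip on
assert output == ref_output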

vllm/config.py

Lines changed: 8 additions & 1 deletion
@@ -1528,6 +1528,10 @@ class CacheConfig:
     checkpoint if available. Otherwise, the scales will default to 1.0."""
     cpu_kvcache_space_bytes: Optional[int] = None
     """(CPU backend only) CPU key-value cache space."""
+    kv_sharing_skip_prefill: bool = False
+    """Skip prefill for tokens where applicable in KV cache sharing
+    scenarios where required key/value tensors have been populated
+    in earlier KV sharing target layers."""

     # Will be set after profiling.
     num_gpu_blocks: Optional[int] = field(default=None, init=False)
@@ -4066,7 +4070,10 @@ class CompilationConfig:
     - None (default): capture sizes are inferred from vllm config.
     - list[int]: capture sizes are specified as given."""
     cudagraph_share_memory_pool: bool = True
-    """Whether to share a single global memory pool for each graph capture"""
+    """Whether to share a single global memory pool for each graph capture
+    When CUDA graphs are not replayed in the same order they are captured,
+    e.g. when compiling multiple modules in a model and modules take different
+    input shapes, it is unsafe to share memory across graph captures."""
     cudagraph_copy_inputs: bool = False
     """Whether to copy input tensors for
     cudagraph. If the caller can guarantee that the same input buffers
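
For context on the expanded cudagraph_share_memory_pool docstring, here is a
minimal sketch, in plain PyTorch rather than vLLM internals, of two CUDA
graphs captured into one shared memory pool via torch.cuda.graph_pool_handle().
The docstring's caveat is that this sharing is only safe when the graphs are
replayed in the same order they were captured.

import torch

if torch.cuda.is_available():
    pool = torch.cuda.graph_pool_handle()   # one pool shared by both captures
    x1 = torch.zeros(8, device="cuda")
    x2 = torch.zeros(16, device="cuda")

    g1, g2 = torch.cuda.CUDAGraph(), torch.cuda.CUDAGraph()
    with torch.cuda.graph(g1, pool=pool):
        y1 = x1 * 2
    with torch.cuda.graph(g2, pool=pool):
        y2 = x2 + 1

    # Replaying in capture order is the safe pattern; replaying out of order
    # with differing input shapes is the situation the docstring warns about.
    g1.replay()
    g2.replay()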

vllm/engine/arg_utils.py

Lines changed: 4 additions & 0 deletions
@@ -459,6 +459,7 @@ class EngineArgs:
     override_attention_dtype: str = ModelConfig.override_attention_dtype

     calculate_kv_scales: bool = CacheConfig.calculate_kv_scales
+    kv_sharing_skip_prefill: bool = CacheConfig.kv_sharing_skip_prefill

     additional_config: dict[str, Any] = \
         get_field(VllmConfig, "additional_config")
@@ -735,6 +736,8 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
                                  **cache_kwargs["cpu_offload_gb"])
         cache_group.add_argument("--calculate-kv-scales",
                                  **cache_kwargs["calculate_kv_scales"])
+        cache_group.add_argument("--kv-sharing-skip-prefill",
+                                 **cache_kwargs["kv_sharing_skip_prefill"])

         # Tokenizer arguments
         tokenizer_kwargs = get_kwargs(TokenizerPoolConfig)
@@ -1120,6 +1123,7 @@ def create_engine_config(
             prefix_caching_hash_algo=self.prefix_caching_hash_algo,
             cpu_offload_gb=self.cpu_offload_gb,
             calculate_kv_scales=self.calculate_kv_scales,
+            kv_sharing_skip_prefill=self.kv_sharing_skip_prefill,
         )

         # Get the current placement group if Ray is initialized and
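
With the CacheConfig field, the EngineArgs attribute, the
--kv-sharing-skip-prefill CLI argument, and the create_engine_config()
plumbing above, the flag flows from the user-facing layers down to the cache
config. A hedged sketch of that flow; the exact create_engine_config()
signature may differ between vLLM versions, so treat this as illustrative
only.

from vllm.engine.arg_utils import EngineArgs

engine_args = EngineArgs(model="Qwen/Qwen2-1.5B-Instruct",
                         kv_sharing_skip_prefill=True)
vllm_config = engine_args.create_engine_config()
# The value lands on the cache config, where the model runner reads it
# (see the gpu_model_runner.py change below).
assert vllm_config.cache_config.kv_sharing_skip_prefill is True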

vllm/entrypoints/llm.py

Lines changed: 2 additions & 0 deletions
@@ -191,6 +191,7 @@ def __init__(
         override_pooler_config: Optional[PoolerConfig] = None,
         compilation_config: Optional[Union[int, dict[str, Any],
                                            CompilationConfig]] = None,
+        kv_sharing_skip_prefill: bool = False,
         **kwargs,
     ) -> None:
         """LLM constructor."""
@@ -264,6 +265,7 @@ def __init__(
             mm_processor_kwargs=mm_processor_kwargs,
             override_pooler_config=override_pooler_config,
             compilation_config=compilation_config_instance,
+            kv_sharing_skip_prefill=kv_sharing_skip_prefill,
             **kwargs,
         )

vllm/envs.py

Lines changed: 0 additions & 3 deletions
@@ -138,7 +138,6 @@
     VLLM_ROCM_QUICK_REDUCE_QUANTIZATION: str = "NONE"
     VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16: bool = True
     VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB: Optional[int] = None
-    VLLM_V1_KV_SHARING_SKIP_PREFILL: bool = False


 def get_default_cache_root():
@@ -955,8 +954,6 @@ def get_vllm_port() -> Optional[int]:
     # models
     "VLLM_USE_NVFP4_CT_EMULATIONS":
     lambda: bool(int(os.getenv("VLLM_USE_NVFP4_CT_EMULATIONS", "0"))),
-    "VLLM_V1_KV_SHARING_SKIP_PREFILL":
-    lambda: os.environ.get("VLLM_V1_KV_SHARING_SKIP_PREFILL", "0") == "1",
 }

 # --8<-- [end:env-vars-definition]

vllm/v1/attention/backends/flash_attn.py

Lines changed: 10 additions & 6 deletions
@@ -256,6 +256,16 @@ def build(
         common_prefix_len: int,
         common_attn_metadata: CommonAttentionMetadata,
     ) -> FlashAttentionMetadata:
+        prefill_skipped_attn_metadata = None
+        if common_attn_metadata.decode_indices is not None:
+            # NOTE(sarckk): attention metadata for partial prefill skip case
+            # needs to be built first, otherwise the line below
+            # block_table.slot_mapping[num_actual_tokens:].fill_(-1)
+            # will override the correct slot mapping
+            prefill_skipped_attn_metadata = self.build_skip_prefill(
+                common_prefix_len=0,  # disable cascade attention
+                common_attn_metadata=common_attn_metadata)
+
         num_reqs = common_attn_metadata.num_reqs
         num_actual_tokens = common_attn_metadata.num_actual_tokens
         max_query_len = common_attn_metadata.max_query_len
@@ -404,12 +414,6 @@ def schedule(batch_size, cu_query_lens, max_query_len, seqlens,
             # we only set num_splits when using cuda graphs.
             max_num_splits = self.max_num_splits

-        prefill_skipped_attn_metadata = None
-        if common_attn_metadata.decode_indices is not None:
-            prefill_skipped_attn_metadata = self.build_skip_prefill(
-                common_prefix_len=0,  # disable cascade attention
-                common_attn_metadata=common_attn_metadata)
-
         attn_metadata = FlashAttentionMetadata(
             num_actual_tokens=num_actual_tokens,
             max_query_len=max_query_len,
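
The NOTE(sarckk) comment is the heart of this fix: the skip-prefill metadata
must be derived from block_table.slot_mapping before the main build path pads
the tail in place with slot_mapping[num_actual_tokens:].fill_(-1). Below is a
toy reproduction of the general ordering hazard with plain tensors; the names
are illustrative, not vLLM's actual buffers.

import torch

slot_mapping = torch.arange(8)      # stand-in for block_table.slot_mapping
num_actual_tokens = 5

# Correct order: capture whatever the skip-prefill metadata needs first...
snapshot = slot_mapping.clone()
# ...then pad the unused tail in place, as the main build path does.
slot_mapping[num_actual_tokens:].fill_(-1)

assert torch.equal(snapshot, torch.arange(8))                     # intact
assert torch.equal(slot_mapping[5:], torch.tensor([-1, -1, -1]))  # padded
# Taking the snapshot only after the fill_ would observe the padded values,
# which is the "override the correct slot mapping" problem the NOTE describes.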

vllm/v1/worker/gpu_model_runner.py

Lines changed: 5 additions & 2 deletions
@@ -580,6 +580,8 @@ def _calc_decode_indices(self, logits_indices: torch.Tensor):
         """
         Pads logits_indices to align with CUDA graph capture sizes
         """
+        if not self.cache_config.kv_sharing_skip_prefill:
+            return None
         num_decodes = logits_indices.shape[0]
         # TODO(sarckk): With chunked prefills, logits_indices contains
         # indices for partial requests though we do not sample any token
@@ -599,8 +601,9 @@ def _calc_decode_indices(self, logits_indices: torch.Tensor):
     def _prepare_inputs(
         self,
         scheduler_output: "SchedulerOutput",
-    ) -> tuple[dict[str, Any], bool, torch.Tensor,
-               Optional[SpecDecodeMetadata], np.ndarray, torch.Tensor]:
+    ) -> tuple[dict[str,
+                    Any], bool, torch.Tensor, Optional[SpecDecodeMetadata],
+               np.ndarray, Optional[torch.Tensor]]:
         """
         :return: tuple[
             attn_metadata: layer-to-attention_metadata mapping,
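
_calc_decode_indices() now returns None unless
cache_config.kv_sharing_skip_prefill is set, which is why the last element of
_prepare_inputs()'s return type becomes Optional[torch.Tensor]. Below is a
hypothetical standalone rendering of that gate plus the padding behavior the
docstring names; the repeat-the-last-index padding strategy and the
capture_sizes parameter are assumptions for illustration, not vLLM's actual
implementation.

from typing import Optional

import torch


def calc_decode_indices(logits_indices: torch.Tensor,
                        kv_sharing_skip_prefill: bool,
                        capture_sizes: list[int]) -> Optional[torch.Tensor]:
    if not kv_sharing_skip_prefill:
        # Feature disabled: callers treat None as "no prefill skip".
        return None
    num_decodes = logits_indices.shape[0]
    # Pad up to the nearest CUDA graph capture size, if one is large enough.
    padded = next((s for s in sorted(capture_sizes) if s >= num_decodes),
                  num_decodes)
    pad = logits_indices[-1:].repeat(padded - num_decodes)
    return torch.cat([logits_indices, pad])


indices = torch.tensor([3, 7, 12])
print(calc_decode_indices(indices, False, [4, 8]))   # None
print(calc_decode_indices(indices, True, [4, 8]))    # tensor([ 3,  7, 12, 12])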
