Commit a88cf11

optimization to skip computing extra metadata if all requests are on decode
Signed-off-by: Yong Hoon Shin <yhshin@meta.com>
1 parent dae0441 commit a88cf11

2 files changed: 13 additions, 1 deletion

tests/v1/e2e/test_kv_sharing_skip_prefill.py

Lines changed: 1 addition & 1 deletion
@@ -278,7 +278,7 @@ def test_kv_sharing_skip_prefill(
     test_prompts: list[list[dict[str, Any]]],
 ):
     ModelRegistry.register_model("Qwen2ForCausalLM", TestQwen2ForCausalLM)
-    sampling_params = SamplingParams(temperature=0.0, max_tokens=42)
+    sampling_params = SamplingParams(temperature=0.0, max_tokens=100)
     prompts = [prompt[0]['content'] for prompt in test_prompts]
     compilation_config = CompilationConfig(
         level=CompilationLevel.PIECEWISE
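
Reading the test change: raising max_tokens from 42 to 100 presumably keeps requests generating long enough after prefill that the batch becomes decode-only, exercising the new early-exit path below. A minimal sketch of the sampling setup, using vLLM's public SamplingParams:

```python
from vllm import SamplingParams

# Greedy decoding with a longer generation budget, so that after the
# initial prefill every request in the batch is in the decode phase.
sampling_params = SamplingParams(temperature=0.0, max_tokens=100)
```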

vllm/v1/worker/gpu_model_runner.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -582,6 +582,18 @@ def _calc_decode_indices(self, logits_indices: torch.Tensor):
582582
"""
583583
if not self.cache_config.kv_sharing_skip_prefill:
584584
return None
585+
586+
num_decode_reqs = 0
587+
for req_index in range(self.input_batch.num_reqs):
588+
if self.input_batch.num_computed_tokens_cpu[
589+
req_index] >= self.input_batch.num_prompt_tokens[
590+
req_index]:
591+
num_decode_reqs += 1
592+
593+
if self.input_batch.num_reqs == num_decode_reqs:
594+
# All requests are on decode, skip calculate decode only indices
595+
return None
596+
585597
num_decodes = logits_indices.shape[0]
586598
# TODO(sarckk): With chunked prefills, logits_indices contains
587599
# indices for partial requests though we do not sample any token
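
For illustration only: the per-request loop above amounts to a single element-wise comparison. A minimal sketch, assuming num_computed_tokens_cpu and num_prompt_tokens are NumPy arrays with at least num_reqs valid entries (the helper name is hypothetical, not part of the commit):

```python
import numpy as np

def all_requests_on_decode(num_computed_tokens_cpu: np.ndarray,
                           num_prompt_tokens: np.ndarray,
                           num_reqs: int) -> bool:
    # A request is in the decode phase once its whole prompt has been
    # computed, i.e. num_computed_tokens >= num_prompt_tokens.
    return bool(np.all(num_computed_tokens_cpu[:num_reqs]
                       >= num_prompt_tokens[:num_reqs]))
```

When this holds for every request, _calc_decode_indices returns None immediately and skips building the decode-only index metadata.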
