
Commit c2c5fea (parent 58ce477)

Refactors

Signed-off-by: fhl <2410591650@qq.com>

5 files changed: +36 -25 lines changed


vllm/compilation/cuda_piecewise_backend.py

Lines changed: 13 additions & 10 deletions
@@ -236,7 +236,9 @@ def __init__(self, graph: fx.GraphModule, vllm_config: VllmConfig,
         self.graph_pool = graph_pool
         self.sym_shape_indices = sym_shape_indices
 
-        self.separate_attention_routine = vllm_config.compilation_config.separate_attention_routine
+        self.separate_attention_routine = (
+            vllm_config.compilation_config.separate_attention_routine
+        )
 
         self.is_debugging_mode = envs.VLLM_LOGGING_LEVEL == "DEBUG"
 

@@ -282,7 +284,7 @@ def __call__(self, *args) -> Any:
         # eagerly run the compiled graphs, which should be cudagraph capturable
         # as a whole.
 
-        concrete_size_entries = self.concrete_size_entries  # default as general usage
+        concrete_size_entries = self.concrete_size_entries
         if self.separate_attention_routine and forward_context.is_pure_decoding:
             concrete_size_entries = self.concrete_size_entries_decode
 

@@ -324,15 +326,16 @@ def __call__(self, *args) -> Any:
             entry.input_addresses = input_addresses
             cudagraph = torch.cuda.CUDAGraph()
 
-            with ExitStack() as stack:
+            with ExitStack(), \
+                torch.cuda.graph(cudagraph, pool=self.graph_pool):
                 # mind-exploding: carefully manage the reference and memory.
-                with torch.cuda.graph(cudagraph, pool=self.graph_pool):
-                    # `output` is managed by pytorch's cudagraph pool
-                    output = entry.runnable(*args)
-                    # by converting it to weak ref,
-                    # the original `output` will immediately be released
-                    # to save memory.
-                    output = weak_ref_tensors(output)
+
+                # `output` is managed by pytorch's cudagraph pool
+                output = entry.runnable(*args)
+                # by converting it to weak ref,
+                # the original `output` will immediately be released
+                # to save memory.
+                output = weak_ref_tensors(output)
 
             # here we always use weak ref for the output
             # to save memory
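The refactor above folds the two nested `with` statements into one and keeps the existing trick of converting the captured output into a weak reference, so the tensor memory stays owned by the shared cudagraph pool rather than being pinned by a strong Python reference. Below is a minimal standalone sketch of that capture/replay pattern in plain PyTorch; it is not vLLM code, `run_gemm` is a stand-in for `entry.runnable`, and it assumes a CUDA device is available.

import torch

def run_gemm(x: torch.Tensor) -> torch.Tensor:
    # stand-in for entry.runnable: any static-shape computation
    return x @ x

pool = torch.cuda.graph_pool_handle()          # shared memory pool, like self.graph_pool
static_x = torch.randn(64, 64, device="cuda")  # static input buffer

# Warm up on a side stream before capture, as CUDA graphs require.
s = torch.cuda.Stream()
s.wait_stream(torch.cuda.current_stream())
with torch.cuda.stream(s):
    run_gemm(static_x)
torch.cuda.current_stream().wait_stream(s)

cudagraph = torch.cuda.CUDAGraph()
with torch.cuda.graph(cudagraph, pool=pool):
    # the output tensor is owned by the cudagraph pool during capture
    static_out = run_gemm(static_x)

# Replay: refill the static input buffer, then re-run the captured kernels.
static_x.copy_(torch.randn(64, 64, device="cuda"))
cudagraph.replay()
print(static_out.sum())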

vllm/config.py

Lines changed: 9 additions & 6 deletions
@@ -3976,11 +3976,11 @@ class CompilationConfig:
     performance benefits for smaller models."""
     separate_attention_routine: bool = False
     """
-    Enable a distinct attention calls routine under an attention backend for full
-    cuda graph capturing. This is because some attention backends like FlashMLA,
-    FlashInfer, FA2, etc. implement different branches for mix prefill-decode and
-    pure decode cases. This flag enables us to potentially capture the cudagraph
-    separately for each branch.
+    Enable a distinct attention calls routine under an attention backend for
+    full cuda graph capturing. This is because some attention backends like
+    FlashMLA, FlashInfer, FA2, etc. implement different branches for mix
+    prefill-decode and pure decode cases. This flag enables us to potentially
+    capture the cudagraph separately for each branch.
     """
 
     pass_config: PassConfig = field(default_factory=PassConfig)

@@ -4187,7 +4187,10 @@ def set_splitting_ops_for_v1(self):
         # the runtime batch_size is not cudagraph captured. This is only
         # supported for separate_attention_routine.
         if self.separate_attention_routine:
-            assert self.full_cuda_graph, "separate_attention_routine requires full_cuda_graph to be True"
+            assert self.full_cuda_graph, (
+                "separate_attention_routine requires "
+                "full_cuda_graph to be True"
+            )
         if not self.splitting_ops:
             self.splitting_ops = [
                 "vllm.unified_attention",

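The reflowed assertion also documents the dependency between the two flags: capturing a separate decode routine only makes sense when attention itself is inside the captured graph. The following is a hypothetical, self-contained mirror of that validation, not the real CompilationConfig dataclass; only the two field names are taken from the diff above.

from dataclasses import dataclass

@dataclass
class _CompilationFlags:  # hypothetical stand-in, not vllm.config.CompilationConfig
    full_cuda_graph: bool = False
    separate_attention_routine: bool = False

    def validate(self) -> None:
        # mirrors the assert added in set_splitting_ops_for_v1
        if self.separate_attention_routine:
            assert self.full_cuda_graph, (
                "separate_attention_routine requires "
                "full_cuda_graph to be True")

# valid combination
_CompilationFlags(full_cuda_graph=True, separate_attention_routine=True).validate()
# this combination would raise AssertionError:
# _CompilationFlags(separate_attention_routine=True).validate()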
vllm/forward_context.py

Lines changed: 1 addition & 1 deletion
@@ -97,7 +97,7 @@ class ForwardContext:
     # determine whether to use a full cudagraph for attention or piecewise
     # cudagraphs that skip the attention part. By default true, we use piecewise
     # cudagraphs.
-    skip_attention_cuda_graphs: bool = True,
+    skip_attention_cuda_graphs: bool = True
     is_pure_decoding: bool = False
 
 
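Dropping the trailing comma is a real fix, not just style: inside a dataclass body, `True,` parses as the one-element tuple `(True,)`, so the old default was a truthy tuple rather than a bool. A small illustration with a hypothetical dataclass (not the actual ForwardContext):

from dataclasses import dataclass

@dataclass
class _Ctx:  # hypothetical stand-in for ForwardContext
    with_comma: bool = True,    # trailing comma: the default is actually (True,)
    without_comma: bool = True  # the intended bool default

ctx = _Ctx()
print(type(ctx.with_comma))     # <class 'tuple'>
print(type(ctx.without_comma))  # <class 'bool'>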
vllm/v1/attention/backends/flashinfer.py

Lines changed: 11 additions & 7 deletions
@@ -227,14 +227,17 @@ def __init__(self, runner: GPUModelRunner, kv_cache_spec: AttentionSpec,
         self._prefill_wrapper = None  # Wrapper for prefill/append
         self._decode_wrapper = None  # Wrapper for decode
         self._decode_wrapper = None  # Wrapper for decode (general shape)
-        self.enable_cuda_graph = self.vllm_config.compilation_config.full_cuda_graph
+        self.enable_cuda_graph = (
+            self.vllm_config.compilation_config.full_cuda_graph
+        )
         if self.enable_cuda_graph:
             # For full cudagraph capture, one `decode_wrapper` for each batch
             # size is needed for FlashInfer.
-            self._decode_wrappers_cudagraph: dict[int, BatchDecodeWithPagedKVCacheWrapper] = {}
-            self._decode_cudagraph_max_bs = min(runner.max_num_reqs,
-                                                runner.cudagraph_batch_sizes[-1])
-
+            self._decode_wrappers_cudagraph: dict[int,
+                BatchDecodeWithPagedKVCacheWrapper] = {}
+            self._decode_cudagraph_max_bs = min(
+                runner.max_num_reqs, runner.cudagraph_batch_sizes[-1])
+
         self._cascade_wrapper = None  # Wrapper for cascade attention
 
         # Global hyperparameters shared by all attention layers

@@ -446,8 +449,9 @@ def _plan(self, attn_metadata: FlashInferMetadata):
         use_cudagraph = (self.enable_cuda_graph and pure_decode and \
             self._num_decodes <= self._decode_cudagraph_max_bs)
         if use_cudagraph:
-            num_input_tokens_decode = self.vllm_config.pad_for_cudagraph(
-                self._num_decodes)
+            num_input_tokens_decode = (
+                self.vllm_config.pad_for_cudagraph(self._num_decodes)
+            )
         else:
             num_input_tokens_decode = self._num_decodes
 
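The per-batch-size decode wrappers are keyed by the padded batch size, so `pad_for_cudagraph` has to map the live decode count onto one of the captured sizes. The sketch below is a hypothetical mirror of that padding step, under the assumption that padding means "round up to the smallest captured batch size"; the real method lives on `VllmConfig` and consults the compiled `cudagraph_batch_sizes`.

import bisect

def pad_for_cudagraph(num_decodes: int, cudagraph_batch_sizes: list[int]) -> int:
    # cudagraph_batch_sizes is assumed sorted ascending, e.g. [1, 2, 4, 8, 16]
    assert num_decodes <= cudagraph_batch_sizes[-1], "caller already checks the max bound"
    idx = bisect.bisect_left(cudagraph_batch_sizes, num_decodes)
    return cudagraph_batch_sizes[idx]

sizes = [1, 2, 4, 8, 16]
print(pad_for_cudagraph(3, sizes))   # 4: the batch is padded up to a captured size
print(pad_for_cudagraph(16, sizes))  # 16: already a captured size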

vllm/v1/worker/gpu_model_runner.py

Lines changed: 2 additions & 1 deletion
@@ -1339,7 +1339,8 @@ def execute_model(
             if self.full_cuda_graph else True
         # Note: When skip_attention_cuda_graphs is always False and
         # compilition_config.separate_attention_routine is True, as in FA2,
-        # this flag helps to determine the correct routine to run for the full cudagraph.
+        # this flag helps to determine the correct routine for the full
+        # cudagraph.
         is_pure_decoding = num_scheduled_tokens == self.input_batch.num_reqs
 
         # Run the model.
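The `is_pure_decoding` check works because a decode step schedules exactly one new token per running request, so the scheduled-token total equals the request count only when no request is still prefilling. A toy illustration with made-up numbers:

# each entry is the number of tokens scheduled for one request in this step
num_reqs = 4
tokens_decode_step = [1, 1, 1, 1]   # pure decode: one token per request
tokens_mixed_step = [1, 1, 37, 1]   # one request still prefilling 37 tokens

print(sum(tokens_decode_step) == num_reqs)  # True  -> is_pure_decoding
print(sum(tokens_mixed_step) == num_reqs)   # False -> mixed prefill/decode batch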
