@@ -621,17 +621,15 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None:
         if batch_changed:
             self.input_batch.refresh_sampling_metadata()
 
-    def _get_forward_metadata_across_dp(
-            self, total_num_scheduled_tokens: int,
-            with_prefill: bool) -> tuple[int, bool]:
-        forward_metadata = torch.tensor(
-            [total_num_scheduled_tokens, with_prefill],
-            device="cpu",
-            dtype=torch.int32)
-        dist.all_reduce(forward_metadata,
-                        op=ReduceOp.MAX,
-                        group=get_dp_group().cpu_group)
-        return int(forward_metadata[0]), bool(forward_metadata[1] > 0)
+    def _get_forward_metadata_across_dp(self, num_tokens: int,
+                                        with_prefill: bool) -> tuple[int, bool]:
+        local_forward_metadata = torch.tensor([num_tokens, with_prefill],
+                                              device="npu", dtype=torch.int32)
+        global_forward_metadata = get_dp_group().all_gather(
+            local_forward_metadata)
+        num_tokens_across_dp = global_forward_metadata[:, 0].cpu()
+        with_prefill = bool(global_forward_metadata[:, 1].any())
+        return num_tokens_across_dp, with_prefill
 
     def get_eagle_atten_dict(
         self,
@@ -1100,9 +1098,12 @@ def _process_reqs(
            AscendAttentionState.DecodeOnly, AscendAttentionState.SpecDecoding
        ]
 
+        num_tokens_across_dp = None
        if self.dp_size > 1:
-            max_num_tokens, with_prefill = self._get_forward_metadata_across_dp(
-                total_num_scheduled_tokens, with_prefill)
+            num_tokens_across_dp, with_prefill = \
+                self._get_forward_metadata_across_dp(num_input_tokens,
+                                                     with_prefill)
+            max_num_tokens = int(num_tokens_across_dp.max().item())
            extra_builder_kwargs['max_num_tokens_across_dp'] = max_num_tokens
            extra_builder_kwargs['with_prefill_across_dp'] = with_prefill
 
@@ -1111,6 +1112,8 @@ def _process_reqs(
            if self.dp_size > 1:
                padded_batch_size = self.select_torchair_padded_batch_size(
                    max_num_tokens)
+                num_tokens_across_dp.masked_fill_(num_tokens_across_dp == -1,
+                                                  padded_batch_size)
            else:
                padded_batch_size = self.select_torchair_padded_batch_size(
                    total_num_scheduled_tokens)
@@ -1189,7 +1192,8 @@ def _process_reqs(
        # Run forward pass
        with set_forward_context(attn_metadata,
                                 self.vllm_config,
-                                 num_tokens=num_input_tokens):
+                                 num_tokens=num_input_tokens,
+                                 num_tokens_across_dp=num_tokens_across_dp):
            with ProfileExecuteDuration().capture_async("forward"):
                model_kwargs = {}
                if self.torchair_graph_enabled:
@@ -1806,6 +1810,7 @@ def _dummy_run(
        is_compile: bool = False,
        with_prefill: bool = True,
        skip_attn: bool = True,
+        num_tokens_across_dp: Optional[int] = None,
    ) -> torch.Tensor:
        # Set num_scheduled_tokens based on num_tokens and max_num_seqs
        # for dummy run with LoRA so that the num_reqs collectively
@@ -1860,7 +1865,8 @@ def _dummy_run(
 
        with set_forward_context(None,
                                 self.vllm_config,
-                                 num_tokens=num_tokens):
+                                 num_tokens=num_tokens,
+                                 num_tokens_across_dp=num_tokens_across_dp):
            if self.torchair_graph_enabled and not with_prefill:
                attn_metadata = self.attn_metadata_builder.build_dummy(
                    num_reqs=num_tokens, num_actual_tokens=1)
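
For context, the following is a minimal, self-contained sketch of the all_gather exchange that the revised _get_forward_metadata_across_dp performs: each DP rank contributes [num_tokens, with_prefill] and receives the full per-rank table, so every rank can see its peers' token counts instead of only an all-reduced maximum. The sketch uses plain torch.distributed with a single-process gloo group; the helper name and group setup are illustrative only and not the vllm-ascend code path (which goes through get_dp_group() on NPU).

import torch
import torch.distributed as dist


def forward_metadata_across_dp(num_tokens: int, with_prefill: bool,
                               group) -> tuple[torch.Tensor, bool]:
    # Each rank contributes its own [num_tokens, with_prefill] pair.
    local = torch.tensor([num_tokens, int(with_prefill)], dtype=torch.int32)
    world_size = dist.get_world_size(group)
    gathered = [torch.empty_like(local) for _ in range(world_size)]
    dist.all_gather(gathered, local, group=group)
    table = torch.stack(gathered)          # shape [world_size, 2]
    num_tokens_across_dp = table[:, 0]     # per-rank token counts
    any_prefill = bool(table[:, 1].any())  # True if any rank runs prefill
    return num_tokens_across_dp, any_prefill


if __name__ == "__main__":
    # Single-process gloo group, only so the sketch runs standalone.
    dist.init_process_group(backend="gloo",
                            init_method="tcp://127.0.0.1:29507",
                            world_size=1, rank=0)
    print(forward_metadata_across_dp(128, True, dist.group.WORLD))
    dist.destroy_process_group()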