Skip to content

Commit d81acdf

Browse files
committed
fix: correct handling of num_tokens for dummy run
Signed-off-by: Jade Zheng <zheng.shoujian@outlook.com>
1 parent b07c798 commit d81acdf

File tree

1 file changed

+12
-13
lines changed

1 file changed

+12
-13
lines changed

vllm_ascend/worker/worker_v1.py

Lines changed: 12 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -247,26 +247,25 @@ def pin_lora(self, lora_id: int) -> bool:
247247

248248
def execute_dummy_batch(self) -> None:
249249
runner = self.model_runner
250+
assert runner.dp_size > 1, "Dummy batch execution should only be " \
251+
"performed with data parallelism enabled, but got " \
252+
f"dp_size={runner.dp_size}."
250253

251254
# If torchair graph is enabled, notify the other DP ranks that this is a
252255
# dummy run by using '-1' as a flag for num_tokens. This will be
253256
# replaced with the final determined graph size before the forward pass.
254-
num_tokens = (-1 if runner.torchair_graph_enabled and not with_prefill
255-
else 1)
256-
num_tokens_across_dp = None
257-
with_prefill = False
258-
259-
if runner.dp_size > 1:
260-
num_tokens_across_dp, with_prefill = \
261-
runner._get_forward_metadata_across_dp(num_tokens, with_prefill)
262-
num_tokens = int(num_tokens_across_dp.max().item())
257+
num_tokens_across_dp, with_prefill = \
258+
runner._get_forward_metadata_across_dp(-1, False)
263259

264260
if runner.torchair_graph_enabled and not with_prefill:
265-
num_tokens = runner.select_torchair_padded_batch_size(num_tokens)
266-
if num_tokens_across_dp is not None:
267-
num_tokens_across_dp.masked_fill_(num_tokens_across_dp == -1,
268-
num_tokens)
261+
max_num_tokens = int(num_tokens_across_dp.max().item())
262+
num_tokens = runner.select_torchair_padded_batch_size(
263+
max_num_tokens)
264+
else:
265+
num_tokens = 1
269266

267+
num_tokens_across_dp.masked_fill_(num_tokens_across_dp == -1,
268+
num_tokens)
270269
runner._dummy_run(num_tokens,
271270
is_compile=False,
272271
num_tokens_across_dp=num_tokens_across_dp,

0 commit comments

Comments
 (0)