Skip to content

Commit 876de71

Browse files
committed
fix: update dummy run batch size handling
Signed-off-by: Jade Zheng <zheng.shoujian@outlook.com>
1 parent 618309a commit 876de71

File tree

1 file changed

+8
-3
lines changed

1 file changed

+8
-3
lines changed

vllm_ascend/worker/model_runner_v1.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1113,14 +1113,19 @@ def _process_reqs(
                 if self.dp_size > 1 else num_input_tokens)
             padded_batch_size = self.select_torchair_padded_batch_size(
                 max_num_tokens)
-            num_tokens_across_dp.masked_fill_(num_tokens_across_dp == -1,
-                                              padded_batch_size)
             graph_pad_size = padded_batch_size - total_num_scheduled_tokens
             extra_builder_kwargs['graph_pad_size'] = graph_pad_size
+            # If torchair graph is enabled and in decode mode, the dummy run
+            # batch size is set to the selected graph size.
+            dummy_num_tokens = padded_batch_size
         else:
             # If torchair graph is not enabled, or if with_prefill is True, the
             # dummy run batch size is set to 1.
-            num_tokens_across_dp.masked_fill_(num_tokens_across_dp == -1, 1)
+            dummy_num_tokens = 1
+
+        if self.dp_size > 1:
+            num_tokens_across_dp.masked_fill_(num_tokens_across_dp == -1,
+                                              dummy_num_tokens)

         if self.vllm_config.model_config.use_mla:
             attn_metadata = self.attn_metadata_builder.build(  # type: ignore

0 commit comments

Comments
 (0)