
Commit adfa9e6

fix: improve handling of max_num_tokens
Signed-off-by: Jade Zheng <zheng.shoujian@outlook.com>
1 parent: 98f9bc9

File tree: 1 file changed (+10, -9 lines)


vllm_ascend/worker/model_runner_v1.py

Lines changed: 10 additions & 9 deletions
@@ -1102,17 +1102,18 @@ def _process_reqs(
 
         # Add graph_pad_size here
         if self.torchair_graph_enabled and not with_prefill:
-            if self.dp_size > 1:
-                padded_batch_size = self.select_torchair_padded_batch_size(
-                    max_num_tokens)
-                num_tokens_across_dp.masked_fill_(num_tokens_across_dp == -1,
-                                                  padded_batch_size)
-            else:
-                padded_batch_size = self.select_torchair_padded_batch_size(
-                    total_num_scheduled_tokens)
+            max_num_tokens = (max_num_tokens
+                              if self.dp_size > 1 else num_input_tokens)
+            padded_batch_size = self.select_torchair_padded_batch_size(
+                max_num_tokens)
+            num_tokens_across_dp.masked_fill_(num_tokens_across_dp == -1,
+                                              padded_batch_size)
             graph_pad_size = padded_batch_size - total_num_scheduled_tokens
-
             extra_builder_kwargs['graph_pad_size'] = graph_pad_size
+        else:
+            # If torchair graph is not enabled, or if with_prefill is True, the
+            # dummy run batch size is set to 1.
+            num_tokens_across_dp.masked_fill_(num_tokens_across_dp == -1, 1)
 
         if self.vllm_config.model_config.use_mla:
             attn_metadata = self.attn_metadata_builder.build(  # type: ignore
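
Context for the change: in torchair graph mode every decode step must run with one of the pre-captured graph batch sizes, and under data parallelism all DP ranks have to pad to the same size, so the padding target is now derived from the global max_num_tokens when dp_size > 1 and from the local num_input_tokens otherwise. Below is a minimal, self-contained sketch of that selection-and-fill logic; the candidate size list and the standalone select_torchair_padded_batch_size helper are illustrative assumptions, not the actual vllm-ascend implementation.

    import torch

    # Assumed capture sizes; in vllm-ascend the real list comes from the
    # torchair graph configuration.
    TORCHAIR_BATCH_SIZES = [1, 2, 4, 8, 16, 32, 64, 128]

    def select_torchair_padded_batch_size(num_tokens: int) -> int:
        # Smallest pre-captured graph batch size that can hold num_tokens.
        for size in TORCHAIR_BATCH_SIZES:
            if num_tokens <= size:
                return size
        return TORCHAIR_BATCH_SIZES[-1]

    # Hypothetical example: 2 DP ranks; this rank scheduled 3 tokens, the
    # global max across ranks is 7, and -1 marks ranks that will execute a
    # dummy run this step.
    dp_size = 2
    num_input_tokens = 3
    max_num_tokens = 7
    num_tokens_across_dp = torch.tensor([7, -1])

    # The patched logic: pad against the global max under DP, the local
    # token count otherwise, then fill the dummy-run placeholders.
    max_num_tokens = max_num_tokens if dp_size > 1 else num_input_tokens
    padded_batch_size = select_torchair_padded_batch_size(max_num_tokens)
    num_tokens_across_dp.masked_fill_(num_tokens_across_dp == -1,
                                      padded_batch_size)

    # Extra slots this rank must pad locally to reach the graph batch size.
    graph_pad_size = padded_batch_size - num_input_tokens

    print(padded_batch_size)              # 8
    print(num_tokens_across_dp.tolist())  # [7, 8]
    print(graph_pad_size)                 # 5

The sketch also mirrors the else branch implicitly: when the torchair path is skipped, the diff simply fills the -1 placeholders with 1 so that idle ranks run a batch-size-1 dummy step.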
