
Commit 4a614a1

[Bugfix] Round graph batch size up to tp size when expert parallel is enabled
Signed-off-by: liziyu <liziyu16@huawei.com>
1 parent 3ea2410 commit 4a614a1

1 file changed: +6 −4 lines changed

vllm_ascend/worker/model_runner_v1.py

Lines changed: 6 additions & 4 deletions
@@ -811,8 +811,8 @@ def _process_reqs(
         assert total_num_scheduled_tokens > 0
         num_reqs = self.input_batch.num_reqs
         assert num_reqs > 0
-        if (self.use_aclgraph and
-                total_num_scheduled_tokens <= self.aclgraph_batch_sizes[-1]):
+        if (self.use_aclgraph and total_num_scheduled_tokens
+                <= self.aclgraph_batch_sizes[-1]):
             # Add padding to the batch size.
             num_input_tokens = self.vllm_config.pad_for_cudagraph(
                 total_num_scheduled_tokens)
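
The guard only takes the padding path when the scheduled token count fits within the largest captured ACL-graph batch size, so the round-up inside pad_for_cudagraph never has to fall back. A minimal standalone sketch of that round-up-to-captured-size behavior (illustrative names; not the actual vllm_config implementation):

    import bisect

    def pad_to_captured_size(num_tokens: int, captured_sizes: list[int]) -> int:
        # captured_sizes is assumed sorted ascending, e.g. [1, 2, 4, 8].
        # Return the smallest captured batch size that can hold num_tokens.
        idx = bisect.bisect_left(captured_sizes, num_tokens)
        return captured_sizes[idx]

    # Mirrors the guarded call above: the caller has already checked that
    # num_tokens <= captured_sizes[-1], so the lookup cannot run off the end.
    assert pad_to_captured_size(3, [1, 2, 4, 8]) == 4
    assert pad_to_captured_size(8, [1, 2, 4, 8]) == 8
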
@@ -2101,7 +2101,9 @@ def check_torchair_graph_batch_sizes(self):
         if self.parallel_config.enable_expert_parallel:
             new_graph_batch_sizes = []
             for graph_batch_size in self.torchair_graph_batch_sizes:
-                cur_graph_batch_size = graph_batch_size + tp_size - graph_batch_size % tp_size
-                if cur_graph_batch_size not in new_graph_batch_sizes:
+                cur_graph_batch_size = (graph_batch_size + tp_size -
+                                        1) // tp_size * tp_size
+                if cur_graph_batch_size not in new_graph_batch_sizes and \
+                        cur_graph_batch_size <= self.scheduler_config.max_num_batched_tokens:
                     new_graph_batch_sizes.append(cur_graph_batch_size)
             self.torchair_graph_batch_sizes = new_graph_batch_sizes
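
The substantive fix is the round-up expression. With the old formula, a graph batch size that was already a multiple of tp_size was still bumped to the next multiple; the ceiling-division form leaves aligned sizes unchanged, and the added condition drops any rounded size that would exceed max_num_batched_tokens. A small self-contained sketch of the two formulas (illustrative, not repo code):

    def round_up_new(batch_size: int, tp_size: int) -> int:
        # Ceiling division, then scale back up: exact multiples stay put.
        return (batch_size + tp_size - 1) // tp_size * tp_size

    def round_up_old(batch_size: int, tp_size: int) -> int:
        # Previous expression: over-rounds values already aligned to tp_size.
        return batch_size + tp_size - batch_size % tp_size

    tp = 4
    assert round_up_new(7, tp) == 8 and round_up_old(7, tp) == 8  # unaligned sizes agree
    assert round_up_new(8, tp) == 8    # aligned size is preserved
    assert round_up_old(8, tp) == 12   # old formula over-padded aligned sizes
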
