Skip to content

Commit 1a10561

Browse files
committed
fix: update dummy run batch size handling
Signed-off-by: Jade Zheng <zheng.shoujian@outlook.com>
1 parent 7420aca commit 1a10561

File tree

1 file changed

+8
-3
lines changed

1 file changed

+8
-3
lines changed

vllm_ascend/worker/model_runner_v1.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1114,14 +1114,19 @@ def _process_reqs(
                 if self.dp_size > 1 else num_input_tokens)
             padded_batch_size = self.select_torchair_padded_batch_size(
                 max_num_tokens)
-            num_tokens_across_dp.masked_fill_(num_tokens_across_dp == -1,
-                                              padded_batch_size)
             graph_pad_size = padded_batch_size - total_num_scheduled_tokens
             extra_builder_kwargs['graph_pad_size'] = graph_pad_size
+            # If torchair graph is enabled and in decode mode, the dummy run
+            # batch size is set to the selected graph size.
+            dummy_num_tokens = padded_batch_size
         else:
             # If torchair graph is not enabled, or if with_prefill is True, the
             # dummy run batch size is set to 1.
-            num_tokens_across_dp.masked_fill_(num_tokens_across_dp == -1, 1)
+            dummy_num_tokens = 1
+
+        if self.dp_size > 1:
+            num_tokens_across_dp.masked_fill_(num_tokens_across_dp == -1,
+                                              dummy_num_tokens)

         if self.vllm_config.model_config.use_mla:
             attn_metadata = self.attn_metadata_builder.build(  # type: ignore

0 commit comments

Comments (0)