Skip to content

Commit d425f5b

Browse files
committed
fix: update dummy run batch size handling
Signed-off-by: Jade Zheng <zheng.shoujian@outlook.com>
1 parent e7064e0 commit d425f5b

File tree

1 file changed

+8
-3
lines changed

1 file changed

+8
-3
lines changed

vllm_ascend/worker/model_runner_v1.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1120,14 +1120,19 @@ def _process_reqs(
11201120
if self.dp_size > 1 else num_input_tokens)
11211121
padded_batch_size = self.select_torchair_padded_batch_size(
11221122
max_num_tokens)
1123-
num_tokens_across_dp.masked_fill_(num_tokens_across_dp == -1,
1124-
padded_batch_size)
11251123
graph_pad_size = padded_batch_size - total_num_scheduled_tokens
11261124
extra_builder_kwargs['graph_pad_size'] = graph_pad_size
1125+
# If torchair graph is enabled and in decode mode, the dummy run
1126+
# batch size is set to the selected graph size.
1127+
dummy_num_tokens = padded_batch_size
11271128
else:
11281129
# If torchair graph is not enabled, or if with_prefill is True, the
11291130
# dummy run batch size is set to 1.
1130-
num_tokens_across_dp.masked_fill_(num_tokens_across_dp == -1, 1)
1131+
dummy_num_tokens = 1
1132+
1133+
if self.dp_size > 1:
1134+
num_tokens_across_dp.masked_fill_(num_tokens_across_dp == -1,
1135+
dummy_num_tokens)
11311136

11321137
if self.vllm_config.model_config.use_mla:
11331138
attn_metadata = self.attn_metadata_builder.build( # type: ignore

0 commit comments

Comments (0)