Commit e7064e0

fix: improve handling of max_num_tokens
Signed-off-by: Jade Zheng <zheng.shoujian@outlook.com>
1 parent b9c7f30 commit e7064e0

File tree

1 file changed: +10 -9

vllm_ascend/worker/model_runner_v1.py

Lines changed: 10 additions & 9 deletions
@@ -1116,17 +1116,18 @@ def _process_reqs(
 
         # Add graph_pad_size here
         if self.torchair_graph_enabled and not with_prefill:
-            if self.dp_size > 1:
-                padded_batch_size = self.select_torchair_padded_batch_size(
-                    max_num_tokens)
-                num_tokens_across_dp.masked_fill_(num_tokens_across_dp == -1,
-                                                  padded_batch_size)
-            else:
-                padded_batch_size = self.select_torchair_padded_batch_size(
-                    total_num_scheduled_tokens)
+            max_num_tokens = (max_num_tokens
+                              if self.dp_size > 1 else num_input_tokens)
+            padded_batch_size = self.select_torchair_padded_batch_size(
+                max_num_tokens)
+            num_tokens_across_dp.masked_fill_(num_tokens_across_dp == -1,
+                                              padded_batch_size)
             graph_pad_size = padded_batch_size - total_num_scheduled_tokens
-
             extra_builder_kwargs['graph_pad_size'] = graph_pad_size
+        else:
+            # If torchair graph is not enabled, or if with_prefill is True, the
+            # dummy run batch size is set to 1.
+            num_tokens_across_dp.masked_fill_(num_tokens_across_dp == -1, 1)
 
         if self.vllm_config.model_config.use_mla:
             attn_metadata = self.attn_metadata_builder.build(  # type: ignore
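
For context on the change: the old code chose the padding target with an if/else on dp_size; the new code folds that into one expression, padding to the cross-DP max_num_tokens when dp_size > 1 (so every rank pads to the same captured graph size) and to the local num_input_tokens otherwise, then fills the -1 sentinels in num_tokens_across_dp with the padded size. When the torchair graph path is not taken, the sentinels are filled with 1, the dummy run batch size. The Python sketch below illustrates that logic in isolation; it is not the vllm-ascend implementation, and the bucket sizes and the reimplemented select_torchair_padded_batch_size are assumptions made for the example.

import torch

# Hypothetical capture buckets; the real sizes come from the torchair graph
# configuration, which this diff does not show.
TORCHAIR_PADDED_SIZES = [1, 2, 4, 8, 16, 32, 64, 128]

def select_torchair_padded_batch_size(num_tokens: int) -> int:
    # Round up to the nearest captured batch size (assumed behavior).
    for size in TORCHAIR_PADDED_SIZES:
        if num_tokens <= size:
            return size
    return TORCHAIR_PADDED_SIZES[-1]

def pad_across_dp(num_tokens_across_dp: torch.Tensor, dp_size: int,
                  max_num_tokens: int, num_input_tokens: int) -> int:
    # Mirror of the patched logic: with data parallelism every rank must
    # land on the same bucket, so the global maximum is used; on a single
    # rank the local token count is enough.
    max_num_tokens = (max_num_tokens
                      if dp_size > 1 else num_input_tokens)
    padded_batch_size = select_torchair_padded_batch_size(max_num_tokens)
    # -1 marks ranks with no scheduled work that will run a dummy batch;
    # masked_fill_ replaces the sentinel in place.
    num_tokens_across_dp.masked_fill_(num_tokens_across_dp == -1,
                                      padded_batch_size)
    return padded_batch_size

tokens = torch.tensor([7, -1, 3, -1])
padded = pad_across_dp(tokens, dp_size=4, max_num_tokens=7, num_input_tokens=7)
print(padded)  # 8
print(tokens)  # tensor([7, 8, 3, 8])

Padding every rank to a common captured batch size is what lets the graph replay without recapture when per-rank token counts differ; the else branch's fill value of 1 keeps idle ranks in lockstep with a minimal dummy batch.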
