Commit 2c7dd85

[Fix] Fix the token-wise padding mechanism (vllm-project#1300)
### What this PR does / why we need it?

Fix the token-wise padding mechanism.

Signed-off-by: Yizhou Liu <liu_yizhou@outlook.com>
1 parent b350eda commit 2c7dd85

1 file changed (+4, -8)


vllm_ascend/worker/model_runner_v1.py

Lines changed: 4 additions & 8 deletions
@@ -956,10 +956,7 @@ def _process_reqs(
         # Copy the tensors to the NPU.
         self.input_ids[:total_num_scheduled_tokens].copy_(
             self.input_ids_cpu[:total_num_scheduled_tokens], non_blocking=True)
-        input_ids = self.input_ids[:num_input_tokens]
 
-        # prepare the MRoPE for mllm if using multimodal
-        num_input_tokens = total_num_scheduled_tokens
         # _prepare_inputs may reorder the batch, so we must gather multi
         # modal outputs after that to ensure the correct order
         if self.is_multimodal_model:
@@ -973,27 +970,26 @@ def _process_reqs(
             # NOTE(woosuk): To unify token ids and soft tokens (vision
             # embeddings), we always use embeddings (rather than token ids)
             # as input to the multimodal model, even when the input is text.
-            input_ids = self.input_ids[:num_input_tokens]
+            input_ids = self.input_ids[:total_num_scheduled_tokens]
             if mm_embeds:
                 inputs_embeds = self.model.get_input_embeddings(
                     input_ids, mm_embeds)
             else:
                 inputs_embeds = self.model.get_input_embeddings(input_ids)
             # TODO(woosuk): Avoid the copy. Optimize.
-            self.inputs_embeds[:num_input_tokens].copy_(inputs_embeds)
+            self.inputs_embeds[:total_num_scheduled_tokens].copy_(
+                inputs_embeds)
             inputs_embeds = self.inputs_embeds[:num_input_tokens]
             input_ids = None
         else:
             # For text-only models, we use token ids as input.
             # While it is possible to use embeddings as input just like the
             # multimodal models, it is not desirable for performance since
-            # then the embedding layer is not included in the CUDA graph.
+            # then the embedding layer is not included in the ACL graph.
             input_ids = self.input_ids[:num_input_tokens]
             inputs_embeds = None
         if self.uses_mrope:
             positions = self.mrope_positions[:, :num_input_tokens]
-        else:
-            positions = self.positions[:num_input_tokens]
 
         if self.torchair_graph_enabled and not with_prefill:
             input_ids = self.input_ids[:padded_batch_size]
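
For readers unfamiliar with the runner's two token counts, the sketch below illustrates the behaviour this fix targets: `total_num_scheduled_tokens` is the number of tokens actually staged this step, while `num_input_tokens` is the larger, graph-padded length, so the embedding copy should cover only the staged prefix. This is a minimal, self-contained sketch; the buffer sizes, the padding rule, and the `embed` helper are illustrative assumptions, not the vllm-ascend implementation.

```python
# Illustrative sketch only -- not the vllm-ascend model runner code.
import torch

MAX_NUM_TOKENS = 16      # capacity of the persistent buffers (assumption)
GRAPH_PAD_MULTIPLE = 8   # hypothetical padding granularity for graph capture
HIDDEN_SIZE = 4          # tiny hidden size for the demo

# Persistent buffers, analogous to self.input_ids / self.inputs_embeds.
input_ids_buf = torch.zeros(MAX_NUM_TOKENS, dtype=torch.int64)
inputs_embeds_buf = torch.zeros(MAX_NUM_TOKENS, HIDDEN_SIZE)

def embed(token_ids: torch.Tensor) -> torch.Tensor:
    """Stand-in for model.get_input_embeddings()."""
    return token_ids.float().unsqueeze(-1).repeat(1, HIDDEN_SIZE)

total_num_scheduled_tokens = 5   # tokens actually scheduled this step
# Round up to the padded length consumed by the captured graph (here: 8).
num_input_tokens = -(-total_num_scheduled_tokens // GRAPH_PAD_MULTIPLE) * GRAPH_PAD_MULTIPLE

# Stage the real tokens into the head of the persistent buffer.
input_ids_buf[:total_num_scheduled_tokens] = torch.arange(1, 6)

# Fixed behaviour: embed and copy only the tokens that were actually staged.
input_ids = input_ids_buf[:total_num_scheduled_tokens]
inputs_embeds_buf[:total_num_scheduled_tokens].copy_(embed(input_ids))

# The padded slice is what the graph consumes; entries past
# total_num_scheduled_tokens keep whatever padding the buffer holds, instead
# of embeddings recomputed from stale token ids (the old behaviour).
inputs_embeds = inputs_embeds_buf[:num_input_tokens]
print(inputs_embeds.shape)   # torch.Size([8, 4])
```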
