@@ -956,10 +956,7 @@ def _process_reqs(
956
956
# Copy the tensors to the NPU.
957
957
self .input_ids [:total_num_scheduled_tokens ].copy_ (
958
958
self .input_ids_cpu [:total_num_scheduled_tokens ], non_blocking = True )
959
- input_ids = self .input_ids [:num_input_tokens ]
960
959
961
- # prepare the MRoPE for mllm if using multimodal
962
- num_input_tokens = total_num_scheduled_tokens
963
960
# _prepare_inputs may reorder the batch, so we must gather multi
964
961
# modal outputs after that to ensure the correct order
965
962
if self .is_multimodal_model :
@@ -973,27 +970,26 @@ def _process_reqs(
973
970
# NOTE(woosuk): To unify token ids and soft tokens (vision
974
971
# embeddings), we always use embeddings (rather than token ids)
975
972
# as input to the multimodal model, even when the input is text.
976
- input_ids = self .input_ids [:num_input_tokens ]
973
+ input_ids = self .input_ids [:total_num_scheduled_tokens ]
977
974
if mm_embeds :
978
975
inputs_embeds = self .model .get_input_embeddings (
979
976
input_ids , mm_embeds )
980
977
else :
981
978
inputs_embeds = self .model .get_input_embeddings (input_ids )
982
979
# TODO(woosuk): Avoid the copy. Optimize.
983
- self .inputs_embeds [:num_input_tokens ].copy_ (inputs_embeds )
980
+ self .inputs_embeds [:total_num_scheduled_tokens ].copy_ (
981
+ inputs_embeds )
984
982
inputs_embeds = self .inputs_embeds [:num_input_tokens ]
985
983
input_ids = None
986
984
else :
987
985
# For text-only models, we use token ids as input.
988
986
# While it is possible to use embeddings as input just like the
989
987
# multimodal models, it is not desirable for performance since
990
- # then the embedding layer is not included in the CUDA graph.
988
+ # then the embedding layer is not included in the ACL graph.
991
989
input_ids = self .input_ids [:num_input_tokens ]
992
990
inputs_embeds = None
993
991
if self .uses_mrope :
994
992
positions = self .mrope_positions [:, :num_input_tokens ]
995
- else :
996
- positions = self .positions [:num_input_tokens ]
997
993
998
994
if self .torchair_graph_enabled and not with_prefill :
999
995
input_ids = self .input_ids [:padded_batch_size ]
0 commit comments