
Commit 73979f5

Authored by Yizhou Liu
[Fix] Revert 20dedb to fix ACL Graph (#1205)
### What this PR does / why we need it?

This PR reverts 20dedb to restore the token-wise padding logic so that ACL Graph can work as expected.

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

CI passed.

---------

Signed-off-by: Yizhou Liu <liu_yizhou@outlook.com>
1 parent 89a388b commit 73979f5
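Background for readers unfamiliar with graph-mode execution: a captured graph (such as ACL Graph) replays with fixed tensor shapes, which is why the token-wise padding this PR restores matters. The sketch below only illustrates the general idea and is not the vllm-ascend implementation; `pad_to_graph_size`, `graph_capture_sizes`, and `pad_token_id` are hypothetical names.

```python
import torch


def pad_to_graph_size(input_ids: torch.Tensor,
                      graph_capture_sizes: list[int],
                      pad_token_id: int = 0) -> torch.Tensor:
    """Pad a 1-D token tensor up to the smallest captured graph size.

    A captured graph only replays with the shapes it was captured for, so
    the number of input tokens must be bumped up to one of those sizes.
    """
    num_tokens = input_ids.shape[0]
    # Pick the smallest captured size that can hold this step's tokens.
    padded_size = next(size for size in sorted(graph_capture_sizes)
                       if size >= num_tokens)
    padding = torch.full((padded_size - num_tokens, ),
                         pad_token_id,
                         dtype=input_ids.dtype,
                         device=input_ids.device)
    return torch.cat([input_ids, padding])


# Example: 13 scheduled tokens are padded up to the captured size 16.
padded = pad_to_graph_size(torch.arange(13), graph_capture_sizes=[8, 16, 32])
assert padded.shape[0] == 16
```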

File tree (2 files changed: +7, -268 lines)

  tests/singlecard/test_offline_inference.py
  vllm_ascend/worker/model_runner_v1.py

tests/singlecard/test_offline_inference.py

Lines changed: 2 additions & 0 deletions
@@ -85,6 +85,8 @@ def test_quantization_models(model: str, max_tokens: int) -> None:
 
 
 @pytest.mark.parametrize("model", MULTIMODALITY_MODELS)
+@pytest.mark.skipif(os.getenv("VLLM_USE_V1") == "1",
+                    reason="qwen2.5_vl is not supported on v1")
 def test_multimodal(model, prompt_template, vllm_runner):
     image = ImageAsset("cherry_blossom") \
         .pil_image.convert("RGB")
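A note on the new marker: `pytest.mark.skipif` evaluates its condition at collection time, so the test above is skipped whenever the environment exports `VLLM_USE_V1=1`. A minimal, self-contained sketch of the same gating pattern (plain pytest, no vLLM imports; the test name is made up):

```python
import os

import pytest


@pytest.mark.skipif(os.getenv("VLLM_USE_V1") == "1",
                    reason="feature not supported on the V1 engine")
def test_runs_only_without_v1_engine():
    # Collected and run only when VLLM_USE_V1 is unset or not "1".
    assert os.getenv("VLLM_USE_V1") != "1"
```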

vllm_ascend/worker/model_runner_v1.py

Lines changed: 5 additions & 268 deletions
@@ -42,11 +42,8 @@
 from vllm.inputs import INPUT_REGISTRY
 from vllm.logger import logger
 from vllm.model_executor.layers.fused_moe import FusedMoE
-from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding
 from vllm.model_executor.model_loader import get_model
-from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.inputs import MultiModalKwargs, PlaceholderRange
-from vllm.multimodal.utils import group_mm_inputs_by_modality
+from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs
 from vllm.sampling_params import SamplingType
 from vllm.sequence import IntermediateTensors
 from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, DeviceMemoryProfiler,
@@ -64,9 +61,6 @@
 from vllm.v1.utils import bind_kv_cache
 from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch
 from vllm.v1.worker.lora_model_runner_mixin import LoRAModelRunnerMixin
-from vllm.v1.worker.utils import (gather_mm_placeholders,
-                                  sanity_check_mm_encoder_outputs,
-                                  scatter_mm_placeholders)
 
 from vllm_ascend.ascend_config import get_ascend_config
 from vllm_ascend.attention.attention import AttentionMaskBuilder
@@ -373,7 +367,6 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None:
         # Remove finished requests from the cached states.
         for req_id in scheduler_output.finished_req_ids:
             self.requests.pop(req_id, None)
-            self.encoder_cache.pop(req_id, None)
         # Remove the finished requests from the persistent batch.
         # NOTE(woosuk): There could be an edge case where finished_req_ids and
         # scheduled_req_ids overlap. This happens when a request is aborted and
@@ -386,14 +379,6 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None:
             if req_index is not None:
                 removed_req_indices.append(req_index)
 
-        # Free the cached encoder outputs.
-        for req_id, input_id in scheduler_output.free_encoder_input_ids:
-            encoder_outputs = self.encoder_cache.get(req_id)
-            if encoder_outputs is not None:
-                encoder_outputs.pop(input_id, None)
-                if not encoder_outputs:
-                    self.encoder_cache.pop(req_id, None)
-
         # Remove the unscheduled requests from the persistent batch.
         # NOTE(woosuk): The unscheduled requests are either preempted requests
         # or running requests that are not scheduled in this step. We remove
@@ -435,43 +420,6 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None:
                 lora_request=new_req_data.lora_request,
             )
 
-            # Only relevant for models using M-RoPE (e.g, Qwen2-VL)
-            if self.uses_mrope:
-                image_grid_thw = []
-                video_grid_thw = []
-                second_per_grid_ts = []
-                audio_feature_lengths = []
-                use_audio_in_video = False
-                for mm_input in self.requests[req_id].mm_inputs:
-                    if mm_input.get("image_grid_thw") is not None:
-                        image_grid_thw.extend(
-                            mm_input["image_grid_thw"].tolist())
-                    if mm_input.get("video_grid_thw") is not None:
-                        video_grid_thw.extend(
-                            mm_input["video_grid_thw"].tolist())
-                    if mm_input.get("second_per_grid_ts") is not None:
-                        second_per_grid_ts.extend(
-                            mm_input["second_per_grid_ts"])
-                    if mm_input.get("audio_feature_lengths") is not None:
-                        audio_feature_lengths.extend(
-                            mm_input["audio_feature_lengths"])
-                    if mm_input.get("use_audio_in_video") is True:
-                        use_audio_in_video = True
-
-                hf_config = self.model_config.hf_config
-
-                self.requests[req_id].mrope_positions, \
-                    self.requests[req_id].mrope_position_delta = \
-                    MRotaryEmbedding.get_input_positions_tensor(
-                        self.requests[req_id].prompt_token_ids,
-                        hf_config=hf_config,
-                        image_grid_thw=image_grid_thw,
-                        video_grid_thw=video_grid_thw,
-                        second_per_grid_ts=second_per_grid_ts,
-                        audio_feature_lengths=audio_feature_lengths,
-                        use_audio_in_video=use_audio_in_video,
-                    )
-
             req_ids_to_add.append(req_id)
 
         # Update the states of the running/resumed requests.
@@ -596,166 +544,6 @@ def _make_attention_mask(self, seq_lens, query_lens, position,
         else:
             return None
 
-    def _calc_mrope_positions(self, scheduler_output: "SchedulerOutput"):
-        mrope_pos_ptr = 0
-        for index, req_id in enumerate(self.input_batch.req_ids):
-            req = self.requests[req_id]
-            assert req.mrope_positions is not None
-
-            num_computed_tokens = \
-                self.input_batch.num_computed_tokens_cpu[index]
-            num_scheduled_tokens = \
-                scheduler_output.num_scheduled_tokens[req_id]
-            num_prompt_tokens = len(req.prompt_token_ids)
-
-            if num_computed_tokens + num_scheduled_tokens > num_prompt_tokens:
-                prompt_part_len = max(0,
-                                      num_prompt_tokens - num_computed_tokens)
-                completion_part_len = max(
-                    0, num_scheduled_tokens - prompt_part_len)
-            else:
-                prompt_part_len = num_scheduled_tokens
-                completion_part_len = 0
-
-            assert num_scheduled_tokens == prompt_part_len + completion_part_len
-
-            if prompt_part_len > 0:
-                # prompt's mrope_positions are pre-computed
-                dst_start = mrope_pos_ptr
-                dst_end = mrope_pos_ptr + prompt_part_len
-                src_start = num_computed_tokens
-                src_end = num_computed_tokens + prompt_part_len
-
-                self.mrope_positions_cpu[:, dst_start:dst_end] = \
-                    req.mrope_positions[:,src_start:src_end]
-
-                mrope_pos_ptr += prompt_part_len
-
-            if completion_part_len > 0:
-                # compute completion's mrope_positions on-the-fly
-                dst_start = mrope_pos_ptr
-                dst_end = mrope_pos_ptr + completion_part_len
-
-                self.mrope_positions_cpu[:, dst_start:dst_end] = \
-                    MRotaryEmbedding.get_next_input_positions_tensor(
-                        req.mrope_position_delta,
-                        context_len=num_computed_tokens +
-                        prompt_part_len,
-                        seq_len=num_computed_tokens +
-                        prompt_part_len +
-                        completion_part_len,
-                    )
-
-                mrope_pos_ptr += completion_part_len
-
-    def _execute_mm_encoder(self, scheduler_output: "SchedulerOutput"):
-        scheduled_encoder_inputs = scheduler_output.scheduled_encoder_inputs
-        if not scheduled_encoder_inputs:
-            return
-
-        # Batch the multi-modal inputs.
-        mm_inputs = list[MultiModalKwargs]()
-        req_ids_pos = list[tuple[str, int, PlaceholderRange]]()
-        for req_id, encoder_input_ids in scheduled_encoder_inputs.items():
-            req_state = self.requests[req_id]
-
-            for mm_input_id in encoder_input_ids:
-                mm_inputs.append(req_state.mm_inputs[mm_input_id])
-                req_ids_pos.append(
-                    (req_id, mm_input_id, req_state.mm_positions[mm_input_id]))
-
-        # Batch mm inputs as much as we can: if a request in the batch has
-        # multiple modalities or a different modality than the previous one,
-        # we process it separately to preserve item order.
-        # FIXME(ywang96): This is a hacky way to deal with multiple modalities
-        # in the same batch while still being able to benefit from batching
-        # multimodal inputs. The proper solution should be reordering the
-        # encoder outputs.
-        grouped_mm_inputs_list = group_mm_inputs_by_modality(mm_inputs)
-
-        encoder_outputs = []
-        for grouped_mm_inputs in grouped_mm_inputs_list:
-            batched_mm_inputs = MultiModalKwargs.batch(grouped_mm_inputs)
-            batched_mm_inputs = MultiModalKwargs.as_kwargs(batched_mm_inputs,
-                                                           device=self.device)
-
-            # Run the encoder.
-            # `curr_group_outputs` is either of the following:
-            # 1. A tensor of shape (num_items, feature_size, hidden_size)
-            # in case feature_size is fixed across all multimodal items.
-            # 2. A list or tuple (length: num_items) of tensors, each of shape
-            # (feature_size, hidden_size) in case the feature size is dynamic
-            # depending on the input multimodal items.
-            curr_group_outputs = self.model.get_multimodal_embeddings(
-                **batched_mm_inputs)
-
-            sanity_check_mm_encoder_outputs(
-                curr_group_outputs,
-                expected_num_items=len(grouped_mm_inputs),
-            )
-
-            for output in curr_group_outputs:
-                encoder_outputs.append(output)
-
-        # Cache the encoder outputs.
-        for (req_id, input_id, pos_info), output in zip(
-                req_ids_pos,
-                encoder_outputs,
-        ):
-            if req_id not in self.encoder_cache:
-                self.encoder_cache[req_id] = {}
-
-            self.encoder_cache[req_id][input_id] = scatter_mm_placeholders(
-                output,
-                is_embed=pos_info.is_embed,
-            )
-
-    def _gather_mm_embeddings(
-        self,
-        scheduler_output: "SchedulerOutput",
-    ) -> list[torch.Tensor]:
-        mm_embeds: list[torch.Tensor] = []
-        for req_id in self.input_batch.req_ids:
-            num_scheduled_tokens = scheduler_output.num_scheduled_tokens[
-                req_id]
-            req_state = self.requests[req_id]
-            num_computed_tokens = req_state.num_computed_tokens
-            mm_positions = req_state.mm_positions
-            for i, pos_info in enumerate(mm_positions):
-                start_pos = pos_info.offset
-                num_encoder_tokens = pos_info.length
-
-                # The encoder output is needed if the two ranges overlap:
-                # [num_computed_tokens,
-                #  num_computed_tokens + num_scheduled_tokens) and
-                # [start_pos, start_pos + num_encoder_tokens)
-                if start_pos >= num_computed_tokens + num_scheduled_tokens:
-                    # The encoder output is not needed in this step.
-                    break
-                if start_pos + num_encoder_tokens <= num_computed_tokens:
-                    # The encoder output is already processed and stored
-                    # in the decoder's KV cache.
-                    continue
-
-                start_idx = max(num_computed_tokens - start_pos, 0)
-                end_idx = min(
-                    num_computed_tokens - start_pos + num_scheduled_tokens,
-                    num_encoder_tokens)
-                assert start_idx < end_idx
-                assert req_id in self.encoder_cache
-                assert i in self.encoder_cache[req_id]
-                encoder_output = self.encoder_cache[req_id][i]
-
-                if (is_embed := pos_info.is_embed) is not None:
-                    is_embed = is_embed[start_idx:end_idx]
-
-                mm_embeds_item = gather_mm_placeholders(
-                    encoder_output[start_idx:end_idx],
-                    is_embed=is_embed,
-                )
-                mm_embeds.append(mm_embeds_item)
-        return mm_embeds
-
     def _process_reqs(
         self,
         scheduler_output: "SchedulerOutput",
@@ -818,17 +606,6 @@ def _process_reqs(
                arange,
                out=positions_np)
 
-        # Calculate M-RoPE positions.
-        # Only relevant for models using M-RoPE (e.g, Qwen2-VL)
-        if self.uses_mrope:
-            self._calc_mrope_positions(scheduler_output)
-
-        if self.uses_mrope:
-            # Only relevant for models using M-RoPE (e.g, Qwen2-VL)
-            self.mrope_positions[:, :total_num_scheduled_tokens].copy_(
-                self.mrope_positions_cpu[:, :total_num_scheduled_tokens],
-                non_blocking=True)
-
         self.positions[:total_num_scheduled_tokens].copy_(
             self.positions_cpu[:total_num_scheduled_tokens], non_blocking=True)
         positions = self.positions[:num_input_tokens]
@@ -943,43 +720,6 @@ def _process_reqs(
             self.input_ids_cpu[:total_num_scheduled_tokens], non_blocking=True)
         input_ids = self.input_ids[:num_input_tokens]
 
-        # prepare the MRoPE for mllm if using multimodal
-        num_input_tokens = total_num_scheduled_tokens
-        # _prepare_inputs may reorder the batch, so we must gather multi
-        # modal outputs after that to ensure the correct order
-        if self.is_multimodal_model:
-            # Run the multimodal encoder if any.
-            self._execute_mm_encoder(scheduler_output)
-            mm_embeds = self._gather_mm_embeddings(scheduler_output)
-        else:
-            mm_embeds = []
-
-        if self.is_multimodal_model:
-            # NOTE(woosuk): To unify token ids and soft tokens (vision
-            # embeddings), we always use embeddings (rather than token ids)
-            # as input to the multimodal model, even when the input is text.
-            input_ids = self.input_ids[:num_input_tokens]
-            if mm_embeds:
-                inputs_embeds = self.model.get_input_embeddings(
-                    input_ids, mm_embeds)
-            else:
-                inputs_embeds = self.model.get_input_embeddings(input_ids)
-            # TODO(woosuk): Avoid the copy. Optimize.
-            self.inputs_embeds[:num_input_tokens].copy_(inputs_embeds)
-            inputs_embeds = self.inputs_embeds[:num_input_tokens]
-            input_ids = None
-        else:
-            # For text-only models, we use token ids as input.
-            # While it is possible to use embeddings as input just like the
-            # multimodal models, it is not desirable for performance since
-            # then the embedding layer is not included in the CUDA graph.
-            input_ids = self.input_ids[:num_input_tokens]
-            inputs_embeds = None
-        if self.uses_mrope:
-            positions = self.mrope_positions[:, :num_input_tokens]
-        else:
-            positions = self.positions[:num_input_tokens]
-
         if (envs_ascend.VLLM_ENABLE_MC2
                 or self.torchair_graph_enabled) and not with_prefill:
             input_ids = self.input_ids[:padded_batch_size]
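The removed multimodal branch always fed the model embeddings rather than token ids so that encoder outputs could be spliced into the text sequence at their placeholder positions. A self-contained sketch of that splice with toy shapes (generic torch code; `placeholder_mask` and the sizes are illustrative, not the vLLM API):

```python
import torch

hidden_size = 8
# Toy text-embedding sequence of 6 tokens, two of which are image placeholders.
inputs_embeds = torch.zeros(6, hidden_size)
placeholder_mask = torch.tensor([False, True, True, False, False, False])
# Encoder output covering the two placeholder positions.
mm_embeds = torch.randn(2, hidden_size)

# Scatter the multimodal embeddings into the placeholder slots; the merged
# embedding tensor (not the raw token ids) is what the decoder consumes.
inputs_embeds[placeholder_mask] = mm_embeds
assert inputs_embeds.shape == (6, hidden_size)
```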
@@ -1001,7 +741,7 @@ def _process_reqs(
                     input_ids=input_ids,
                     positions=positions,
                     intermediate_tensors=intermediate_tensors,
-                    inputs_embeds=inputs_embeds,
+                    inputs_embeds=None,
                     **model_kwargs,
                 )
             else:
@@ -1010,7 +750,7 @@ def _process_reqs(
                     input_ids=input_ids,
                     positions=positions,
                     intermediate_tensors=intermediate_tensors,
-                    inputs_embeds=inputs_embeds,
+                    inputs_embeds=None,
                     **model_kwargs,
                 )
 
@@ -1493,11 +1233,8 @@ def _dummy_run(
         return hidden_states
 
     def profile_run(self) -> None:
-        # FIXME Profile with multimodal encoder & encoder cache.
-        # current _profile_multimodal() using PyTorch SDPA backend method not
-        # support for window/full attn to reduce Memcpy operations, so will cause
-        # Out Of Memory problem, so we currently don't use self._profile_multimodal()
-        # self._profile_multimodal()
+        # Profile with multimodal encoder & encoder cache.
+        self._profile_multimodal()
 
         # For profile, have maximum num_reqs and that collectively have
         # maximum num_tokens.
