from vllm.inputs import INPUT_REGISTRY
from vllm.logger import logger
from vllm.model_executor.layers.fused_moe import FusedMoE
-from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding
from vllm.model_executor.model_loader import get_model
-from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.inputs import MultiModalKwargs, PlaceholderRange
-from vllm.multimodal.utils import group_mm_inputs_by_modality
+from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs
from vllm.sampling_params import SamplingType
from vllm.sequence import IntermediateTensors
from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, DeviceMemoryProfiler,
from vllm.v1.utils import bind_kv_cache
from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch
from vllm.v1.worker.lora_model_runner_mixin import LoRAModelRunnerMixin
-from vllm.v1.worker.utils import (gather_mm_placeholders,
-                                  sanity_check_mm_encoder_outputs,
-                                  scatter_mm_placeholders)

from vllm_ascend.ascend_config import get_ascend_config
from vllm_ascend.attention.attention import AttentionMaskBuilder
@@ -373,7 +367,6 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None:
        # Remove finished requests from the cached states.
        for req_id in scheduler_output.finished_req_ids:
            self.requests.pop(req_id, None)
-            self.encoder_cache.pop(req_id, None)
        # Remove the finished requests from the persistent batch.
        # NOTE(woosuk): There could be an edge case where finished_req_ids and
        # scheduled_req_ids overlap. This happens when a request is aborted and
@@ -386,14 +379,6 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None:
            if req_index is not None:
                removed_req_indices.append(req_index)

-        # Free the cached encoder outputs.
-        for req_id, input_id in scheduler_output.free_encoder_input_ids:
-            encoder_outputs = self.encoder_cache.get(req_id)
-            if encoder_outputs is not None:
-                encoder_outputs.pop(input_id, None)
-                if not encoder_outputs:
-                    self.encoder_cache.pop(req_id, None)
-
        # Remove the unscheduled requests from the persistent batch.
        # NOTE(woosuk): The unscheduled requests are either preempted requests
        # or running requests that are not scheduled in this step. We remove
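
For reference, the eviction removed above follows a two-level pattern: drop one input's cached encoder output, then drop the request entry once nothing remains for it. A minimal standalone sketch (the helper name and the plain-dict cache shape are illustrative, not vllm API):

def free_encoder_output(encoder_cache: dict[str, dict[int, object]],
                        req_id: str, input_id: int) -> None:
    # Drop one input's cached output, then the whole request entry once
    # no outputs remain for that request.
    outputs = encoder_cache.get(req_id)
    if outputs is not None:
        outputs.pop(input_id, None)
        if not outputs:
            encoder_cache.pop(req_id, None)

cache = {"req-0": {0: "img-emb", 1: "audio-emb"}}
free_encoder_output(cache, "req-0", 0)
free_encoder_output(cache, "req-0", 1)
assert cache == {}  # request entry is freed with its last cached output
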
@@ -435,43 +420,6 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None:
                lora_request=new_req_data.lora_request,
            )

-            # Only relevant for models using M-RoPE (e.g, Qwen2-VL)
-            if self.uses_mrope:
-                image_grid_thw = []
-                video_grid_thw = []
-                second_per_grid_ts = []
-                audio_feature_lengths = []
-                use_audio_in_video = False
-                for mm_input in self.requests[req_id].mm_inputs:
-                    if mm_input.get("image_grid_thw") is not None:
-                        image_grid_thw.extend(
-                            mm_input["image_grid_thw"].tolist())
-                    if mm_input.get("video_grid_thw") is not None:
-                        video_grid_thw.extend(
-                            mm_input["video_grid_thw"].tolist())
-                    if mm_input.get("second_per_grid_ts") is not None:
-                        second_per_grid_ts.extend(
-                            mm_input["second_per_grid_ts"])
-                    if mm_input.get("audio_feature_lengths") is not None:
-                        audio_feature_lengths.extend(
-                            mm_input["audio_feature_lengths"])
-                    if mm_input.get("use_audio_in_video") is True:
-                        use_audio_in_video = True
-
-                hf_config = self.model_config.hf_config
-
-                self.requests[req_id].mrope_positions, \
-                    self.requests[req_id].mrope_position_delta = \
-                    MRotaryEmbedding.get_input_positions_tensor(
-                        self.requests[req_id].prompt_token_ids,
-                        hf_config=hf_config,
-                        image_grid_thw=image_grid_thw,
-                        video_grid_thw=video_grid_thw,
-                        second_per_grid_ts=second_per_grid_ts,
-                        audio_feature_lengths=audio_feature_lengths,
-                        use_audio_in_video=use_audio_in_video,
-                    )
-
            req_ids_to_add.append(req_id)

        # Update the states of the running/resumed requests.
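
The block removed above gathers grid and timing metadata from every multimodal input of a request before passing it to MRotaryEmbedding.get_input_positions_tensor. A standalone sketch of that aggregation over plain lists instead of tensors (the helper name and dict layout are illustrative, not vllm API):

def collect_mrope_metadata(mm_inputs: list[dict]) -> dict:
    # Flatten per-item grid/timing metadata across all multimodal items
    # of a single request.
    meta: dict = {
        "image_grid_thw": [],
        "video_grid_thw": [],
        "second_per_grid_ts": [],
        "audio_feature_lengths": [],
        "use_audio_in_video": False,
    }
    for mm_input in mm_inputs:
        for key in ("image_grid_thw", "video_grid_thw",
                    "second_per_grid_ts", "audio_feature_lengths"):
            if mm_input.get(key) is not None:
                meta[key].extend(mm_input[key])
        if mm_input.get("use_audio_in_video") is True:
            meta["use_audio_in_video"] = True
    return meta

# One image item and one video item in the same request.
meta = collect_mrope_metadata([
    {"image_grid_thw": [[1, 4, 4]]},
    {"video_grid_thw": [[16, 4, 4]], "second_per_grid_ts": [1.0]},
])
assert meta["image_grid_thw"] == [[1, 4, 4]]
assert meta["second_per_grid_ts"] == [1.0]
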
@@ -596,166 +544,6 @@ def _make_attention_mask(self, seq_lens, query_lens, position,
        else:
            return None

-    def _calc_mrope_positions(self, scheduler_output: "SchedulerOutput"):
-        mrope_pos_ptr = 0
-        for index, req_id in enumerate(self.input_batch.req_ids):
-            req = self.requests[req_id]
-            assert req.mrope_positions is not None
-
-            num_computed_tokens = \
-                self.input_batch.num_computed_tokens_cpu[index]
-            num_scheduled_tokens = \
-                scheduler_output.num_scheduled_tokens[req_id]
-            num_prompt_tokens = len(req.prompt_token_ids)
-
-            if num_computed_tokens + num_scheduled_tokens > num_prompt_tokens:
-                prompt_part_len = max(0,
-                                      num_prompt_tokens - num_computed_tokens)
-                completion_part_len = max(
-                    0, num_scheduled_tokens - prompt_part_len)
-            else:
-                prompt_part_len = num_scheduled_tokens
-                completion_part_len = 0
-
-            assert num_scheduled_tokens == prompt_part_len + completion_part_len
-
-            if prompt_part_len > 0:
-                # prompt's mrope_positions are pre-computed
-                dst_start = mrope_pos_ptr
-                dst_end = mrope_pos_ptr + prompt_part_len
-                src_start = num_computed_tokens
-                src_end = num_computed_tokens + prompt_part_len
-
-                self.mrope_positions_cpu[:, dst_start:dst_end] = \
-                    req.mrope_positions[:,src_start:src_end]
-
-                mrope_pos_ptr += prompt_part_len
-
-            if completion_part_len > 0:
-                # compute completion's mrope_positions on-the-fly
-                dst_start = mrope_pos_ptr
-                dst_end = mrope_pos_ptr + completion_part_len
-
-                self.mrope_positions_cpu[:, dst_start:dst_end] = \
-                    MRotaryEmbedding.get_next_input_positions_tensor(
-                        req.mrope_position_delta,
-                        context_len=num_computed_tokens +
-                        prompt_part_len,
-                        seq_len=num_computed_tokens +
-                        prompt_part_len +
-                        completion_part_len,
-                    )
-
-                mrope_pos_ptr += completion_part_len
-
-    def _execute_mm_encoder(self, scheduler_output: "SchedulerOutput"):
-        scheduled_encoder_inputs = scheduler_output.scheduled_encoder_inputs
-        if not scheduled_encoder_inputs:
-            return
-
-        # Batch the multi-modal inputs.
-        mm_inputs = list[MultiModalKwargs]()
-        req_ids_pos = list[tuple[str, int, PlaceholderRange]]()
-        for req_id, encoder_input_ids in scheduled_encoder_inputs.items():
-            req_state = self.requests[req_id]
-
-            for mm_input_id in encoder_input_ids:
-                mm_inputs.append(req_state.mm_inputs[mm_input_id])
-                req_ids_pos.append(
-                    (req_id, mm_input_id, req_state.mm_positions[mm_input_id]))
-
-        # Batch mm inputs as much as we can: if a request in the batch has
-        # multiple modalities or a different modality than the previous one,
-        # we process it separately to preserve item order.
-        # FIXME(ywang96): This is a hacky way to deal with multiple modalities
-        # in the same batch while still being able to benefit from batching
-        # multimodal inputs. The proper solution should be reordering the
-        # encoder outputs.
-        grouped_mm_inputs_list = group_mm_inputs_by_modality(mm_inputs)
-
-        encoder_outputs = []
-        for grouped_mm_inputs in grouped_mm_inputs_list:
-            batched_mm_inputs = MultiModalKwargs.batch(grouped_mm_inputs)
-            batched_mm_inputs = MultiModalKwargs.as_kwargs(batched_mm_inputs,
-                                                           device=self.device)
-
-            # Run the encoder.
-            # `curr_group_outputs` is either of the following:
-            # 1. A tensor of shape (num_items, feature_size, hidden_size)
-            # in case feature_size is fixed across all multimodal items.
-            # 2. A list or tuple (length: num_items) of tensors, each of shape
-            # (feature_size, hidden_size) in case the feature size is dynamic
-            # depending on the input multimodal items.
-            curr_group_outputs = self.model.get_multimodal_embeddings(
-                **batched_mm_inputs)
-
-            sanity_check_mm_encoder_outputs(
-                curr_group_outputs,
-                expected_num_items=len(grouped_mm_inputs),
-            )
-
-            for output in curr_group_outputs:
-                encoder_outputs.append(output)
-
-        # Cache the encoder outputs.
-        for (req_id, input_id, pos_info), output in zip(
-                req_ids_pos,
-                encoder_outputs,
-        ):
-            if req_id not in self.encoder_cache:
-                self.encoder_cache[req_id] = {}
-
-            self.encoder_cache[req_id][input_id] = scatter_mm_placeholders(
-                output,
-                is_embed=pos_info.is_embed,
-            )
-
-    def _gather_mm_embeddings(
-        self,
-        scheduler_output: "SchedulerOutput",
-    ) -> list[torch.Tensor]:
-        mm_embeds: list[torch.Tensor] = []
-        for req_id in self.input_batch.req_ids:
-            num_scheduled_tokens = scheduler_output.num_scheduled_tokens[
-                req_id]
-            req_state = self.requests[req_id]
-            num_computed_tokens = req_state.num_computed_tokens
-            mm_positions = req_state.mm_positions
-            for i, pos_info in enumerate(mm_positions):
-                start_pos = pos_info.offset
-                num_encoder_tokens = pos_info.length
-
-                # The encoder output is needed if the two ranges overlap:
-                # [num_computed_tokens,
-                #  num_computed_tokens + num_scheduled_tokens) and
-                # [start_pos, start_pos + num_encoder_tokens)
-                if start_pos >= num_computed_tokens + num_scheduled_tokens:
-                    # The encoder output is not needed in this step.
-                    break
-                if start_pos + num_encoder_tokens <= num_computed_tokens:
-                    # The encoder output is already processed and stored
-                    # in the decoder's KV cache.
-                    continue
-
-                start_idx = max(num_computed_tokens - start_pos, 0)
-                end_idx = min(
-                    num_computed_tokens - start_pos + num_scheduled_tokens,
-                    num_encoder_tokens)
-                assert start_idx < end_idx
-                assert req_id in self.encoder_cache
-                assert i in self.encoder_cache[req_id]
-                encoder_output = self.encoder_cache[req_id][i]
-
-                if (is_embed := pos_info.is_embed) is not None:
-                    is_embed = is_embed[start_idx:end_idx]
-
-                mm_embeds_item = gather_mm_placeholders(
-                    encoder_output[start_idx:end_idx],
-                    is_embed=is_embed,
-                )
-                mm_embeds.append(mm_embeds_item)
-        return mm_embeds
-
    def _process_reqs(
        self,
        scheduler_output: "SchedulerOutput",
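
The _gather_mm_embeddings method removed above selects, for each multimodal placeholder, the slice of its cached encoder output that overlaps the tokens scheduled in the current step. A minimal standalone sketch of that interval arithmetic (the helper name is hypothetical, not a vllm API):

from typing import Optional

def needed_encoder_slice(start_pos: int, num_encoder_tokens: int,
                         num_computed_tokens: int,
                         num_scheduled_tokens: int) -> Optional[tuple[int, int]]:
    # Placeholder range [start_pos, start_pos + num_encoder_tokens) vs. the
    # scheduled window [num_computed_tokens,
    #                   num_computed_tokens + num_scheduled_tokens).
    if start_pos >= num_computed_tokens + num_scheduled_tokens:
        return None  # not reached yet in this step
    if start_pos + num_encoder_tokens <= num_computed_tokens:
        return None  # already consumed; lives in the decoder's KV cache
    start_idx = max(num_computed_tokens - start_pos, 0)
    end_idx = min(num_computed_tokens - start_pos + num_scheduled_tokens,
                  num_encoder_tokens)
    return start_idx, end_idx

# 100 encoder tokens placed at prompt offset 50, with 120 tokens computed and
# 40 scheduled this step: only the tail slice [70, 100) is gathered.
assert needed_encoder_slice(50, 100, 120, 40) == (70, 100)
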
@@ -818,17 +606,6 @@ def _process_reqs(
               arange,
               out=positions_np)

-        # Calculate M-RoPE positions.
-        # Only relevant for models using M-RoPE (e.g, Qwen2-VL)
-        if self.uses_mrope:
-            self._calc_mrope_positions(scheduler_output)
-
-        if self.uses_mrope:
-            # Only relevant for models using M-RoPE (e.g, Qwen2-VL)
-            self.mrope_positions[:, :total_num_scheduled_tokens].copy_(
-                self.mrope_positions_cpu[:, :total_num_scheduled_tokens],
-                non_blocking=True)
-
        self.positions[:total_num_scheduled_tokens].copy_(
            self.positions_cpu[:total_num_scheduled_tokens], non_blocking=True)
        positions = self.positions[:num_input_tokens]
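
The call removed above relies on _calc_mrope_positions (deleted in the previous hunk), which splits each scheduled window into a prompt part whose M-RoPE positions were precomputed when the request was added, and a completion part positioned on the fly from mrope_position_delta. A minimal standalone sketch of that split (hypothetical helper name):

def split_scheduled_tokens(num_computed_tokens: int,
                           num_scheduled_tokens: int,
                           num_prompt_tokens: int) -> tuple[int, int]:
    # Tokens still inside the prompt reuse precomputed positions; any
    # remainder belongs to the completion.
    if num_computed_tokens + num_scheduled_tokens > num_prompt_tokens:
        prompt_part_len = max(0, num_prompt_tokens - num_computed_tokens)
        completion_part_len = max(0, num_scheduled_tokens - prompt_part_len)
    else:
        prompt_part_len = num_scheduled_tokens
        completion_part_len = 0
    assert num_scheduled_tokens == prompt_part_len + completion_part_len
    return prompt_part_len, completion_part_len

# A 16-token prompt with 12 tokens already computed and 8 newly scheduled
# yields 4 prompt tokens and 4 completion tokens for this step.
assert split_scheduled_tokens(12, 8, 16) == (4, 4)
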
@@ -943,43 +720,6 @@ def _process_reqs(
            self.input_ids_cpu[:total_num_scheduled_tokens], non_blocking=True)
        input_ids = self.input_ids[:num_input_tokens]

-        # prepare the MRoPE for mllm if using multimodal
-        num_input_tokens = total_num_scheduled_tokens
-        # _prepare_inputs may reorder the batch, so we must gather multi
-        # modal outputs after that to ensure the correct order
-        if self.is_multimodal_model:
-            # Run the multimodal encoder if any.
-            self._execute_mm_encoder(scheduler_output)
-            mm_embeds = self._gather_mm_embeddings(scheduler_output)
-        else:
-            mm_embeds = []
-
-        if self.is_multimodal_model:
-            # NOTE(woosuk): To unify token ids and soft tokens (vision
-            # embeddings), we always use embeddings (rather than token ids)
-            # as input to the multimodal model, even when the input is text.
-            input_ids = self.input_ids[:num_input_tokens]
-            if mm_embeds:
-                inputs_embeds = self.model.get_input_embeddings(
-                    input_ids, mm_embeds)
-            else:
-                inputs_embeds = self.model.get_input_embeddings(input_ids)
-            # TODO(woosuk): Avoid the copy. Optimize.
-            self.inputs_embeds[:num_input_tokens].copy_(inputs_embeds)
-            inputs_embeds = self.inputs_embeds[:num_input_tokens]
-            input_ids = None
-        else:
-            # For text-only models, we use token ids as input.
-            # While it is possible to use embeddings as input just like the
-            # multimodal models, it is not desirable for performance since
-            # then the embedding layer is not included in the CUDA graph.
-            input_ids = self.input_ids[:num_input_tokens]
-            inputs_embeds = None
-        if self.uses_mrope:
-            positions = self.mrope_positions[:, :num_input_tokens]
-        else:
-            positions = self.positions[:num_input_tokens]
-
        if (envs_ascend.VLLM_ENABLE_MC2
                or self.torchair_graph_enabled) and not with_prefill:
            input_ids = self.input_ids[:padded_batch_size]
@@ -1001,7 +741,7 @@ def _process_reqs(
                        input_ids=input_ids,
                        positions=positions,
                        intermediate_tensors=intermediate_tensors,
-                        inputs_embeds=inputs_embeds,
+                        inputs_embeds=None,
                        **model_kwargs,
                    )
                else:
@@ -1010,7 +750,7 @@ def _process_reqs(
                        input_ids=input_ids,
                        positions=positions,
                        intermediate_tensors=intermediate_tensors,
-                        inputs_embeds=inputs_embeds,
+                        inputs_embeds=None,
                        **model_kwargs,
                    )

@@ -1493,11 +1233,8 @@ def _dummy_run(
        return hidden_states

    def profile_run(self) -> None:
-        # FIXME Profile with multimodal encoder & encoder cache.
-        # current _profile_multimodal() using PyTorch SDPA backend method not
-        # support for window/full attn to reduce Memcpy operations, so will cause
-        # Out Of Memory problem, so we currently don't use self._profile_multimodal()
-        # self._profile_multimodal()
+        # Profile with multimodal encoder & encoder cache.
+        self._profile_multimodal()

        # For profile, have maximum num_reqs and that collectively have
        # maximum num_tokens.