File tree Expand file tree Collapse file tree 2 files changed +4
-2
lines changed
vllm/model_executor/models Expand file tree Collapse file tree 2 files changed +4
-2
lines changed Original file line number Diff line number Diff line change @@ -247,7 +247,8 @@ def check_available_online(
247
247
trust_remote_code = True ,
248
248
v0_only = True ),
249
249
"Phi4FlashForCausalLM" : _HfExamplesInfo ("microsoft/Phi-4-mini-flash-reasoning" , # noqa: E501
250
- trust_remote_code = True ),
250
+ trust_remote_code = True ,
251
+ v0_only = True ),
251
252
"PhiMoEForCausalLM" : _HfExamplesInfo ("microsoft/Phi-3.5-MoE-instruct" ,
252
253
trust_remote_code = True ),
253
254
"Plamo2ForCausalLM" : _HfExamplesInfo ("pfnet/plamo-2-1b" ,
Original file line number Diff line number Diff line change 8
8
import torch .nn as nn
9
9
from transformers .activations import ACT2FN
10
10
11
+ import vllm .envs as envs
11
12
from vllm .attention import Attention , AttentionMetadata , AttentionType
12
13
from vllm .attention .selector import _Backend
13
14
from vllm .config import CacheConfig , VllmConfig
@@ -563,7 +564,7 @@ def forward(
563
564
# the kv cache since we reuse the kv cache from last layer.
564
565
# If in prefill phase, we can truncate
565
566
# the hidden state to save computation cost.
566
- if attn_metadata .prefill_metadata :
567
+ if attn_metadata .prefill_metadata and not envs . VLLM_USE_V1 :
567
568
selected_token_indices = torch .cumsum (
568
569
attn_metadata .seq_lens_tensor , dim = 0 ) - 1
569
570
hidden_states = hidden_states .index_select (
You can’t perform that action at this time.
0 commit comments