address comments

congcongchen123 · congcongchen123 · commit 9ae5d95339d5 · 2025-07-11T09:23:08.000Z
Signed-off-by: Congcong Chen <congcongchen@microsoft.com>
diff --git a/tests/models/registry.py b/tests/models/registry.py
@@ -247,7 +247,8 @@ def check_available_online(
                                             trust_remote_code=True,
                                             v0_only=True),
     "Phi4FlashForCausalLM": _HfExamplesInfo("microsoft/Phi-4-mini-flash-reasoning", # noqa: E501
-                                        trust_remote_code=True),
+                                        trust_remote_code=True,
+                                        v0_only=True),
     "PhiMoEForCausalLM": _HfExamplesInfo("microsoft/Phi-3.5-MoE-instruct",
                                          trust_remote_code=True),
     "Plamo2ForCausalLM": _HfExamplesInfo("pfnet/plamo-2-1b",
diff --git a/vllm/model_executor/models/phi4flash.py b/vllm/model_executor/models/phi4flash.py
@@ -8,6 +8,7 @@
 import torch.nn as nn
 from transformers.activations import ACT2FN
 
+import vllm.envs as envs
 from vllm.attention import Attention, AttentionMetadata, AttentionType
 from vllm.attention.selector import _Backend
 from vllm.config import CacheConfig, VllmConfig
@@ -563,7 +564,7 @@ def forward(
                 # the kv cache since we reuse the kv cache from last layer.
                 # If in prefill phase, we can <s>prune></s> truncate
                 # the hidden state to save computation cost.
-                if attn_metadata.prefill_metadata:
+                if attn_metadata.prefill_metadata and not envs.VLLM_USE_V1:
                     selected_token_indices = torch.cumsum(
                         attn_metadata.seq_lens_tensor, dim=0) - 1
                     hidden_states = hidden_states.index_select(