Commit 11e30fe

address comment
Signed-off-by: Congcong Chen <congcongchen@microsoft.com>
1 parent: 875e85b

13 files changed: 60 additions & 17 deletions


tests/models/registry.py

Lines changed: 2 additions & 0 deletions
@@ -246,6 +246,8 @@ def check_available_online(
     "Phi3SmallForCausalLM": _HfExamplesInfo("microsoft/Phi-3-small-8k-instruct",
                                             trust_remote_code=True,
                                             v0_only=True),
+    "Phi4FlashForCausalLM": _HfExamplesInfo("microsoft/Phi-4-mini-flash-reasoning",  # noqa: E501
+                                            trust_remote_code=True),
     "PhiMoEForCausalLM": _HfExamplesInfo("microsoft/Phi-3.5-MoE-instruct",
                                          trust_remote_code=True),
     "Plamo2ForCausalLM": _HfExamplesInfo("pfnet/plamo-2-1b",

tests/models/test_initialization.py

Lines changed: 3 additions & 0 deletions
@@ -86,6 +86,9 @@ def _initialize_kv_caches_v1(self, vllm_config):
                        _initialize_kv_caches_v1), monkeypatch.context() as m):
         if model_info.v0_only:
             m.setenv("VLLM_USE_V1", "0")
+        if model_arch == "Phi4FlashForCausalLM":
+            # Phi4FlashForCausalLM only supports DIFFERENTIAL_FLASH_ATTN backend
+            m.setenv("VLLM_ATTENTION_BACKEND", "DIFFERENTIAL_FLASH_ATTN")
         LLM(
             model_info.default,
             tokenizer=model_info.tokenizer,
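
For context, a minimal sketch (not part of this commit) of how the same backend pinning looks when running the model directly with vLLM; the model name, architecture, and environment variable come from the diffs above, everything else is illustrative:

    import os

    from vllm import LLM, SamplingParams

    # Phi4FlashForCausalLM only runs on the DIFFERENTIAL_FLASH_ATTN backend,
    # so pin the backend before the engine is constructed.
    os.environ["VLLM_ATTENTION_BACKEND"] = "DIFFERENTIAL_FLASH_ATTN"

    llm = LLM(model="microsoft/Phi-4-mini-flash-reasoning",
              trust_remote_code=True)
    outputs = llm.generate(["Explain KV-cache sharing in one sentence."],
                           SamplingParams(max_tokens=64))
    print(outputs[0].outputs[0].text)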

tests/test_utils.py

Lines changed: 25 additions & 0 deletions
@@ -458,6 +458,31 @@ def test_bind_kv_cache():
     assert ctx['layers.2.self_attn'].kv_cache[0] is kv_cache[2]
     assert ctx['layers.3.self_attn'].kv_cache[0] is kv_cache[3]
 
+def test_bind_kv_cache_kv_sharing():
+    from vllm.attention import Attention
+
+    ctx = {
+        'layers.0.self_attn': Attention(32, 128, 0.1),
+        'layers.1.self_attn': Attention(32, 128, 0.1),
+        'layers.2.self_attn': Attention(32, 128, 0.1),
+        'layers.3.self_attn': Attention(32, 128, 0.1),
+    }
+    kv_cache = [
+        torch.zeros((1, )),
+        torch.zeros((1, )),
+        torch.zeros((1, )),
+        torch.zeros((1, )),
+    ]
+    shared_kv_cache_layers = {
+        'layers.2.self_attn': 'layers.1.self_attn',
+        'layers.3.self_attn': 'layers.0.self_attn'
+    }
+    bind_kv_cache(ctx, [kv_cache], shared_kv_cache_layers)
+    assert ctx['layers.0.self_attn'].kv_cache[0] is kv_cache[0]
+    assert ctx['layers.1.self_attn'].kv_cache[0] is kv_cache[1]
+    assert ctx['layers.2.self_attn'].kv_cache[0] is kv_cache[1]
+    assert ctx['layers.3.self_attn'].kv_cache[0] is kv_cache[0]
+
 def test_bind_kv_cache_non_attention():
     from vllm.attention import Attention
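
One reading of the new test (inferred from the asserts above, not an authoritative description of bind_kv_cache): each key in shared_kv_cache_layers names a layer that reuses another layer's KV cache, and the value names the layer that owns it. A minimal sketch of that lookup, with a hypothetical helper name:

    # Hypothetical helper, not part of vLLM: picks the tensor a layer binds to.
    def resolve_kv_cache(layer_name, kv_cache_by_layer, shared_kv_cache_layers):
        # A layer listed as a key borrows the cache of the layer it maps to;
        # any other layer keeps its own cache.
        owner = shared_kv_cache_layers.get(layer_name, layer_name)
        return kv_cache_by_layer[owner]

    # With the mapping from the test, 'layers.2.self_attn' resolves to the
    # cache owned by 'layers.1.self_attn', matching the `is kv_cache[1]` assert.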

vllm/attention/backends/blocksparse_attn.py

Lines changed: 2 additions & 1 deletion
@@ -308,7 +308,8 @@ def __init__(
         kv_sharing_target_layer_name: Optional[str] = None,
     ) -> None:
         if kv_sharing_target_layer_name is not None:
-            raise NotImplementedError("KV sharing is not supported in V0.")
+            raise NotImplementedError("KV sharing is not supported in V0 "
+                                      "BLOCK_SPARSE_FLASH_ATTN Backend.")
         assert blocksparse_params is not None
         assert alibi_slopes is None, ValueError(
             "Alibi not support for blocksparse flash attention.")

vllm/attention/backends/differential_flash_attn.py

Lines changed: 0 additions & 2 deletions
@@ -676,8 +676,6 @@ def __init__(
         self.used_shared_kv_cache = \
             self.differential_flash_attention_config.get(
                 "used_shared_kv_cache", False)
-        # if kv_sharing_target_layer_name is not None:
-        #     raise NotImplementedError("KV sharing is not supported in V0.")
         self.kv_sharing_target_layer_name = kv_sharing_target_layer_name
         if blocksparse_params is not None:
             raise ValueError(
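
For contrast with the guards added in the backends below, the differential backend now keeps the target layer name instead of rejecting it. A rough sketch of the two patterns (simplified constructor, illustrative class name, not a real vLLM class):

    from typing import Optional

    class ExampleAttentionImpl:  # illustrative only
        def __init__(self,
                     kv_sharing_target_layer_name: Optional[str] = None,
                     supports_kv_sharing: bool = False) -> None:
            if kv_sharing_target_layer_name is not None and not supports_kv_sharing:
                # The V0 backends below keep this guard, now with a
                # backend-specific message.
                raise NotImplementedError(
                    "KV sharing is not supported in this backend.")
            # Backends that support it record which layer's cache to read.
            self.kv_sharing_target_layer_name = kv_sharing_target_layer_name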

vllm/attention/backends/dual_chunk_flash_attn.py

Lines changed: 2 additions & 1 deletion
@@ -295,7 +295,8 @@ def __init__(
         dual_chunk_attention_config: Optional[Dict[str, Any]] = None,
     ) -> None:
         if kv_sharing_target_layer_name is not None:
-            raise NotImplementedError("KV sharing is not supported in V0.")
+            raise NotImplementedError("KV sharing is not supported in V0 "
+                                      "DUAL_CHUNK_FLASH_ATTN backend.")
         self.num_heads = num_heads
         self.head_size = head_size
         self.scale = float(scale)

vllm/attention/backends/flash_attn.py

Lines changed: 2 additions & 1 deletion
@@ -622,7 +622,8 @@ def __init__(
         use_irope: bool = False,
     ) -> None:
         if kv_sharing_target_layer_name is not None:
-            raise NotImplementedError("KV sharing is not supported in V0.")
+            raise NotImplementedError("KV sharing is not supported in V0 "
+                                      "FLASH_ATTN backend.")
         if blocksparse_params is not None:
             raise ValueError(
                 "FlashAttention does not support block-sparse attention.")

vllm/attention/backends/flashinfer.py

Lines changed: 2 additions & 1 deletion
@@ -938,7 +938,8 @@ def __init__(
         use_irope: bool = False,
     ) -> None:
         if kv_sharing_target_layer_name is not None:
-            raise NotImplementedError("KV sharing is not supported in V0.")
+            raise NotImplementedError("KV sharing is not supported in V0 "
+                                      "FLASHINFER backend.")
         if use_irope:
             logger.warning_once(
                 "Using irope in FlashInfer is not supported yet, it will fall"

vllm/attention/backends/hpu_attn.py

Lines changed: 2 additions & 1 deletion
@@ -115,7 +115,8 @@ def __init__(
     ) -> None:
         super(AttentionImpl, self).__init__()
         if kv_sharing_target_layer_name is not None:
-            raise NotImplementedError("KV sharing is not supported in V0.")
+            raise NotImplementedError("KV sharing is not supported in V0 "
+                                      "HPU_ATTN backend.")
         if use_irope:
             logger.warning_once(
                 "Using irope in HPU is not supported yet, it will fall back "

vllm/attention/backends/rocm_flash_attn.py

Lines changed: 2 additions & 1 deletion
@@ -499,7 +499,8 @@ def __init__(
         use_irope: bool = False,
     ) -> None:
         if kv_sharing_target_layer_name is not None:
-            raise NotImplementedError("KV sharing is not supported in V0.")
+            raise NotImplementedError("KV sharing is not supported in V0 "
+                                      "ROCM_FLASH backend.")
         if use_irope:
             logger.warning_once(
                 "Using irope in ROCm Flash Attention is not supported yet, it "
