
Commit c4d3f2f

address comments
Signed-off-by: Congcong Chen <congcongchen@microsoft.com>
1 parent c5f769b commit c4d3f2f

File tree: 6 files changed (+53, -52 lines)


vllm/attention/backends/abstract.py

Lines changed: 0 additions & 2 deletions
@@ -30,8 +30,6 @@ class AttentionType:
     ENCODER_ONLY = "encoder_only"
     # Attention between dec. Q and enc. K/V for encoder-decoder
     ENCODER_DECODER = "encoder_decoder"
-    # Attention layer that reuse kv cache
-    DECODER_DECODER = "decoder_decoder"


 class AttentionBackend(ABC):

vllm/attention/backends/differential_flash_attn.py

Lines changed: 3 additions & 4 deletions
@@ -1,6 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""Attention layer with FlashAttention."""
 from collections import defaultdict
 from dataclasses import dataclass
 from itertools import accumulate
@@ -55,7 +54,6 @@ def get_kv_cache_shape(
     ) -> Tuple[int, ...]:
         if block_size % 16 != 0:
             raise ValueError("Block size must be a multiple of 16.")
-        # return (2, num_blocks, block_size, num_kv_heads, head_size)
         return (2, 2, num_blocks, block_size, num_kv_heads // 2, head_size)

     @staticmethod
@@ -634,8 +632,9 @@ def __init__(
         self.differential_flash_attention_config = differential_flash_attention_config
         self.used_shared_kv_cache = self.differential_flash_attention_config.get(
             "used_shared_kv_cache", False)
-        if kv_sharing_target_layer_name is not None:
-            raise NotImplementedError("KV sharing is not supported in V0.")
+        # if kv_sharing_target_layer_name is not None:
+        #     raise NotImplementedError("KV sharing is not supported in V0.")
+        self.kv_sharing_target_layer_name = kv_sharing_target_layer_name
         if blocksparse_params is not None:
             raise ValueError(
                 "FlashAttention does not support block-sparse attention.")

vllm/attention/layer.py

Lines changed: 3 additions & 3 deletions
@@ -160,9 +160,9 @@ def __init__(
         self.attn_type = attn_type

         if kv_sharing_target_layer_name is not None:
-            if not envs.VLLM_USE_V1:
-                raise NotImplementedError(
-                    "Cross-layer KV sharing is not supported in V0.")
+            # if not envs.VLLM_USE_V1:
+            #     raise NotImplementedError(
+            #         "Cross-layer KV sharing is not supported in V0.")

             validate_kv_sharing_target(
                 prefix,

vllm/model_executor/models/phi4flash.py

Lines changed: 15 additions & 40 deletions
@@ -138,7 +138,13 @@ def __init__(self,
             "subln": self.subln,
             }
         }
-
+
+        if yoco_cross:
+            kv_shared_layer_index = config.num_hidden_layers//2 + 1
+            kv_sharing_target_layer_name = f"model.layers.{kv_shared_layer_index}.self_attn.attn"  # noqa: E501
+        else:
+            kv_sharing_target_layer_name = None
+
         self.attn = Attention(
             self.num_heads,
             self.head_dim,
@@ -147,7 +153,8 @@ def __init__(self,
             cache_config=cache_config,
             per_layer_sliding_window=sliding_window,
             prefix=f"{prefix}.attn",
-            attn_type=AttentionType.DECODER_DECODER if self.yoco_cross else AttentionType.DECODER,
+            attn_type=AttentionType.DECODER,
+            kv_sharing_target_layer_name=kv_sharing_target_layer_name,
             **params
         )

@@ -157,9 +164,6 @@ def lambda_init_fn(self, depth):
     def forward(
         self,
         hidden_states: torch.Tensor,
-        positions: torch.Tensor,
-        kv_cache: torch.Tensor,
-        attn_metadata: AttentionMetadata,
     ):

         if not self.yoco_cross:  # need to generate kv-cache
@@ -168,9 +172,6 @@ def forward(
             attn_output = self.attn(q, k, v)
         else:  # re-use the kv cache, full attention
             q = self.Wqkv(hidden_states)
-            virtual_engine = get_virtual_engine()
-            if self.attn.kv_cache[virtual_engine].numel() == 0:
-                self.attn.kv_cache[virtual_engine] = kv_cache
             attn_output = self.attn(q, None, None)
         attn_output = attn_output.view(-1, self.num_heads * self.head_dim)
         return self.out_proj(attn_output)
@@ -417,15 +418,14 @@ def forward(
         self,
         hidden_states: torch.Tensor,
         positions: torch.Tensor,
-        kv_cache: torch.Tensor,
         attn_metadata: AttentionMetadata,
         mamba_cache_params: MambaCacheParams,
         ssm_output: Optional[torch.LongTensor] = None,
     ) -> Union[torch.Tensor, IntermediateTensors]:
         if self.use_mamba:
-            assert kv_cache is None and mamba_cache_params is not None
+            assert mamba_cache_params is not None
         else:
-            assert kv_cache is not None and mamba_cache_params is None
+            assert mamba_cache_params is None

         residual = hidden_states
         hidden_states = self.input_layernorm(hidden_states.to(dtype=self.input_layernorm.weight.dtype))
@@ -441,9 +441,6 @@ def forward(
         else:
             attn_outputs = self.attn(
                 hidden_states,
-                positions,
-                kv_cache,
-                attn_metadata,
             )
             hidden_states = residual + attn_outputs
         residual = hidden_states
@@ -452,12 +449,7 @@ def forward(
         hidden_states = residual + hidden_states

         return hidden_states, ssm_output
-
-def get_kv_cache(layer_name):
-    forward_context: ForwardContext = get_forward_context()
-    self = forward_context.no_compile_layers[layer_name]
-    kv_cache = self.kv_cache[forward_context.virtual_engine]
-    return kv_cache
+


 class SambaYModel(nn.Module):

@@ -513,16 +505,16 @@ def forward(
             assert intermediate_tensors is not None
             hidden_states = intermediate_tensors["hidden_states"]

-        kv_cache_idx = 0
         mamba_state_idx = 0
         ssm_output = None
         for i in range(self.start_layer, self.end_layer):
             layer = self.layers[i]
             if i == self.config.num_hidden_layers // 2 + 2:
                 # profile run
+                kv_cache_idx = self.config.num_hidden_layers//2 + 1
                 cache_layer = self.layers[kv_cache_idx]
-                kv_cache = get_kv_cache(cache_layer.attn.attn.layer_name)
-                if kv_cache.numel() == 0:
+                kv_cache = cache_layer.attn.attn.kv_cache
+                if kv_cache[0].numel() == 0:
                     break

                 # Starting from this layer, we do not need to cuculate the kv cache since we reuse
@@ -546,31 +538,14 @@ def forward(
                 hidden_states, ssm_output = layer(
                     hidden_states,
                     positions,
-                    None,  # kv_cache
                     attn_metadata,
                     mamba_cache,
                     ssm_output = ssm_output
                 )
             else:
-                if i < self.config.num_hidden_layers // 2:
-                    # sliding window attention
-                    cache_layer = self.layers[i]
-                    kv_cache = get_kv_cache(cache_layer.attn.attn.layer_name)
-                    kv_cache_idx = i
-                elif not layer.yoco_cross:
-                    # full attention that generates kv cache
-                    cache_layer = self.layers[i]
-                    kv_cache = get_kv_cache(cache_layer.attn.attn.layer_name)
-                    kv_cache_idx = i
-                else:
-                    # full attention that reuses kv cache
-                    cache_layer = self.layers[kv_cache_idx]
-                    kv_cache = get_kv_cache(cache_layer.attn.attn.layer_name)
-
                 hidden_states, ssm_output = layer(
                     hidden_states,
                     positions,
-                    kv_cache,
                     attn_metadata,
                     None,  # mamba_cache_params
                     ssm_output = ssm_output
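
For illustration, a minimal sketch (not vLLM code) of how the kv_sharing_target_layer_name added above resolves; the 32-layer count and printed values are assumptions for this example, not taken from the commit.

from typing import Optional

num_hidden_layers = 32  # hypothetical value, not read from this commit

def kv_sharing_target(yoco_cross: bool) -> Optional[str]:
    # Mirrors the yoco_cross branch added in __init__ above.
    if not yoco_cross:
        return None  # this layer writes its own KV cache
    # Every cross layer reads the cache written by the single producer layer.
    kv_shared_layer_index = num_hidden_layers // 2 + 1
    return f"model.layers.{kv_shared_layer_index}.self_attn.attn"

print(kv_sharing_target(True))   # model.layers.17.self_attn.attn
print(kv_sharing_target(False))  # None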

vllm/utils/__init__.py

Lines changed: 7 additions & 1 deletion
@@ -2881,6 +2881,7 @@ def get_mp_context():
 def bind_kv_cache(
     ctx: dict[str, Any],
     kv_cache: list[list[torch.Tensor]],  # [virtual_engine][layer_index]
+    shared_kv_cache_layers: dict[str, str],
 ) -> None:
     # Bind the kv_cache tensor to Attention modules, similar to
     # ctx[layer_name].kv_cache[ve]=kv_cache[ve][extract_layer_index(layer_name)]
@@ -2904,11 +2905,16 @@ def bind_kv_cache(
             extract_layer_index(layer_name)
             for layer_name in layer_need_kv_cache))
     for layer_name in layer_need_kv_cache:
+        target_layer_name = shared_kv_cache_layers[layer_name] if layer_name \
+            in shared_kv_cache_layers else layer_name
         kv_cache_idx = layer_index_sorted.index(
-            extract_layer_index(layer_name))
+            extract_layer_index(target_layer_name))
         forward_ctx = ctx[layer_name]
         assert len(forward_ctx.kv_cache) == len(kv_cache)
+
         for ve, ve_kv_cache in enumerate(kv_cache):
+            assert kv_cache_idx < len(ve_kv_cache), \
+                "v0 doesn't support interleaving kv sharing, use v1 instead"
             forward_ctx.kv_cache[ve] = ve_kv_cache[kv_cache_idx]

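
As a self-contained sketch of the redirection that the new shared_kv_cache_layers argument performs, consider the toy setup below; the layer names, tensor shapes, and the simplified local extract_layer_index helper are made up for this example and are not vLLM code.

import torch

# Toy setup: three attention layers, one virtual engine, and only two cache
# tensors, because layer 2 reuses the cache written by layer 1.
kv_cache = [[torch.zeros(2, 16, 4, 8), torch.zeros(2, 16, 4, 8)]]
layer_names = ["model.layers.0.attn", "model.layers.1.attn", "model.layers.2.attn"]
shared_kv_cache_layers = {"model.layers.2.attn": "model.layers.1.attn"}

def extract_layer_index(name: str) -> int:  # simplified stand-in
    return int(name.split(".")[2])

layer_index_sorted = sorted(extract_layer_index(n) for n in layer_names)

bound = {}
for name in layer_names:
    # Redirect a sharing layer to its target before computing the cache slot,
    # as the patched bind_kv_cache does above.
    target = shared_kv_cache_layers.get(name, name)
    idx = layer_index_sorted.index(extract_layer_index(target))
    assert idx < len(kv_cache[0]), "interleaved sharing needs vLLM v1"
    bound[name] = kv_cache[0][idx]

# The sharing layer ends up bound to the exact same tensor as its target.
assert bound["model.layers.2.attn"] is bound["model.layers.1.attn"]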

vllm/worker/worker.py

Lines changed: 25 additions & 2 deletions
@@ -9,7 +9,8 @@
 import torch.distributed

 import vllm.envs as envs
-from vllm.config import VllmConfig
+from vllm.attention.layer import Attention
+from vllm.config import (VllmConfig, get_layers_from_vllm_config)
 from vllm.device_allocator.cumem import CuMemAllocator
 from vllm.distributed import (ensure_model_parallel_initialized,
                               init_distributed_environment,
@@ -26,6 +27,7 @@
                           SequenceGroupMetadata, SequenceGroupMetadataDelta)
 from vllm.utils import (GiB_bytes, MemorySnapshot, bind_kv_cache,
                         memory_profiling)
+from vllm.v1.worker.utils import initialize_kv_cache_for_kv_sharing
 from vllm.worker.cache_engine import CacheEngine
 from vllm.worker.enc_dec_model_runner import EncoderDecoderModelRunner
 from vllm.worker.model_runner import GPUModelRunnerBase, ModelRunner
@@ -345,8 +347,29 @@ def _init_cache_engine(self):
             self.cache_engine[ve].gpu_cache
             for ve in range(self.parallel_config.pipeline_parallel_size)
         ]
+
+        # Layer pairings for cross-layer KV sharing.
+        # If an Attention layer `layer_name` is in the keys of this dict, it
+        # means this layer will perform attention using the keys and values
+        # from the KV cache of `shared_kv_cache_layers[layer_name]`.
+        shared_kv_cache_layers: dict[str, str] = {}
+
+        attn_layers = get_layers_from_vllm_config(self.vllm_config, Attention)
+
+        for layer_name, attn_module in attn_layers.items():
+            if (kv_tgt_layer :=
+                    attn_module.kv_sharing_target_layer_name) is not None:
+                # The layer doesn't need its own KV cache and will use that of
+                # the target layer. We skip creating a KVCacheSpec for it, so
+                # that KV cache management logic will act as this layer does
+                # not exist, and doesn't allocate KV cache for the layer. This
+                # enables the memory saving of cross-layer kv sharing, allowing
+                # a given amount of memory to accommodate longer context lengths
+                # or enable more requests to be processed simultaneously.
+                shared_kv_cache_layers[layer_name] = kv_tgt_layer
+
         bind_kv_cache(self.compilation_config.static_forward_context,
-                      self.gpu_cache)
+                      self.gpu_cache, shared_kv_cache_layers)

     def _warm_up_model(self) -> None:
         # warm up sizes that are not in cudagraph capture sizes,
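
To make the resulting mapping concrete, here is a hedged illustration of what shared_kv_cache_layers could contain for a model in which every attention layer past the midpoint reuses the single producer layer's cache; the 8-layer count and module paths are assumptions for this example, not read from the commit.

num_hidden_layers = 8  # hypothetical
producer = f"model.layers.{num_hidden_layers // 2 + 1}.self_attn.attn"

shared_kv_cache_layers = {
    f"model.layers.{i}.self_attn.attn": producer
    for i in range(num_hidden_layers // 2 + 2, num_hidden_layers)
}
# -> {'model.layers.6.self_attn.attn': 'model.layers.5.self_attn.attn',
#     'model.layers.7.self_attn.attn': 'model.layers.5.self_attn.attn'}

bind_kv_cache then points each key's forward context at the cache tensor allocated for the producer layer, so the second-half attention layers consume no additional KV-cache memory.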
