Commit 5b03235

[Attention] MLA - Flashinfer Ragged Prefill (#20034)
1 parent 922f316 commit 5b03235

File tree

10 files changed: +422 additions, −215 deletions

tests/v1/kv_connector/__init__.py

Whitespace-only changes.

tests/v1/kv_connector/unit/test_multi_connector.py

Lines changed: 15 additions & 72 deletions
@@ -3,16 +3,10 @@
 import filecmp
 import shutil
 import tempfile
-from collections import defaultdict
 from pathlib import Path
 
 from vllm import LLM, SamplingParams
-from vllm.config import KVTransferConfig, VllmConfig
-from vllm.distributed.kv_transfer.kv_connector.factory import (
-    KVConnectorFactory)
-from vllm.distributed.kv_transfer.kv_connector.v1.shared_storage_connector import (  # noqa
-    SharedStorageConnector)
-from vllm.v1.core.kv_cache_manager import KVCacheBlocks
+from vllm.config import KVTransferConfig
 
 MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct"

@@ -25,65 +19,6 @@
 SAMPLING_PARAMS = SamplingParams(temperature=0, max_tokens=20)
 
 
-class TestSharedStorageConnector(SharedStorageConnector):
-
-    def __init__(self, config: VllmConfig, role):
-        self.name = config.kv_transfer_config.kv_connector_extra_config["name"]
-        self._connector = SharedStorageConnector(config, role)
-        self.call_record: dict[str, int] = defaultdict(int)
-        # Use a unique temp file per connector
-        self._event_file = tempfile.gettempdir(
-        ) + f"/connector_{self.name}-{self.role.name}_events.log"
-        # Start with an empty file
-        with open(self._event_file, "w") as _:
-            pass
-
-    def __getattribute__(self, name):
-        if name in ("_connector", "call_record", "name", "_event_file",
-                    "__class__", "__dict__", "__getattribute__",
-                    "__init__"):  # avoid recursion
-            return object.__getattribute__(self, name)
-        if not hasattr(self._connector, name):
-            return object.__getattribute__(self, name)
-        attr = getattr(self._connector, name)
-
-        # Intercept calls to the connector interface and write an event
-        # for each one to a file, which can be read back in the main test proc.
-        if callable(attr):
-
-            def wrapper(*args, **kwargs):
-                self.call_record[name] += 1
-
-                # Include args that we're interested in
-                to_log = [name]
-                for arg in args:
-                    if isinstance(arg, int):
-                        to_log.append(str(arg))
-                    elif isinstance(arg, KVCacheBlocks):
-                        to_log.append(
-                            f"num_blocks={[len(b) for b in arg.blocks]}")
-
-                # Log the event as a line to the file
-                try:
-                    with open(self._event_file, "a") as f:
-                        f.write(' '.join(to_log) + "\n")
-                except Exception as e:
-                    print(f"[ERROR] Could not log event {name} "
-                          f"for {self.name}: {e}")
-                return attr(*args, **kwargs)
-
-            return wrapper
-        return attr
-
-
-# This relies on "fork" multiprocessing method being used.
-# It's the default but vLLM may fall back to spawn if for example CUDA
-# is already initialized.
-KVConnectorFactory.register_connector("TestSharedStorageConnector",
-                                      TestSharedStorageConnector.__module__,
-                                      TestSharedStorageConnector.__name__)
-
-
 # Helper function to compare directories recursively
 def _compare_directories(dir1: Path, dir2: Path) -> bool:
     """Compares two directories recursively for identical content."""

@@ -118,19 +53,27 @@ def test_multi_shared_storage_connector_consistency():
         kv_role="kv_both",
         kv_connector_extra_config={
             "connectors": [{
-                "kv_connector": "TestSharedStorageConnector",
-                "kv_role": "kv_both",
+                "kv_connector":
+                "TestSharedStorageConnector",
+                "kv_role":
+                "kv_both",
                 "kv_connector_extra_config": {
                     "shared_storage_path": str(storage_1_path),
                     "name": "storage1",
-                }
+                },
+                "kv_connector_module_path":
+                "tests.v1.kv_connector.unit.utils",
             }, {
-                "kv_connector": "TestSharedStorageConnector",
-                "kv_role": "kv_both",
+                "kv_connector":
+                "TestSharedStorageConnector",
+                "kv_role":
+                "kv_both",
                 "kv_connector_extra_config": {
                     "shared_storage_path": str(storage_2_path),
                     "name": "storage2",
-                }
+                },
+                "kv_connector_module_path":
+                "tests.v1.kv_connector.unit.utils",
             }]
         },
     )
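
The substance of this test change: TestSharedStorageConnector moves out of the test module (into tests/v1/kv_connector/unit/utils.py, below), and each connector entry now carries kv_connector_module_path so the engine process can import and register the connector itself rather than relying on the registration being inherited through fork. A minimal single-connector sketch of the same mechanism — the field names follow this diff, while the storage path is illustrative:

    from vllm import LLM
    from vllm.config import KVTransferConfig

    # Sketch, not part of this commit: the engine imports the named module,
    # which registers TestSharedStorageConnector with KVConnectorFactory at
    # import time, so this works under "spawn" as well as "fork".
    kv_transfer_config = KVTransferConfig(
        kv_connector="TestSharedStorageConnector",
        kv_connector_module_path="tests.v1.kv_connector.unit.utils",
        kv_role="kv_both",
        kv_connector_extra_config={
            "shared_storage_path": "/tmp/kv_storage",
            "name": "storage1",
        },
    )

    llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct",
              kv_transfer_config=kv_transfer_config)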

tests/v1/kv_connector/unit/utils.py

Lines changed: 62 additions & 0 deletions
@@ -1,12 +1,19 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import tempfile
+from collections import defaultdict
 from typing import Any, Optional
 
 import torch
 
 from vllm import SamplingParams
 from vllm.config import (CacheConfig, DeviceConfig, KVTransferConfig,
                          ModelConfig, SchedulerConfig, VllmConfig)
+from vllm.distributed.kv_transfer.kv_connector.factory import (
+    KVConnectorFactory)
+from vllm.distributed.kv_transfer.kv_connector.v1.shared_storage_connector import (  # noqa
+    SharedStorageConnector)
+from vllm.v1.core.kv_cache_manager import KVCacheBlocks
 from vllm.v1.core.sched.scheduler import Scheduler
 from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
                                         KVCacheGroupSpec)

@@ -187,3 +194,58 @@ def create_model_runner_output(
         finished_sending=finished_sending,
         finished_recving=finished_recving,
     )
+
+
+class TestSharedStorageConnector(SharedStorageConnector):
+
+    def __init__(self, config: VllmConfig, role):
+        self.name = config.kv_transfer_config.kv_connector_extra_config["name"]
+        self._connector = SharedStorageConnector(config, role)
+        self.call_record: dict[str, int] = defaultdict(int)
+        # Use a unique temp file per connector
+        self._event_file = tempfile.gettempdir(
+        ) + f"/connector_{self.name}-{self.role.name}_events.log"
+        # Start with an empty file
+        with open(self._event_file, "w") as _:
+            pass
+
+    def __getattribute__(self, name):
+        if name in ("_connector", "call_record", "name", "_event_file",
+                    "__class__", "__dict__", "__getattribute__",
+                    "__init__"):  # avoid recursion
+            return object.__getattribute__(self, name)
+        if not hasattr(self._connector, name):
+            return object.__getattribute__(self, name)
+        attr = getattr(self._connector, name)
+
+        # Intercept calls to the connector interface and write an event
+        # for each one to a file, which can be read back in the main test proc.
+        if callable(attr):
+
+            def wrapper(*args, **kwargs):
+                self.call_record[name] += 1
+
+                # Include args that we're interested in
+                to_log = [name]
+                for arg in args:
+                    if isinstance(arg, int):
+                        to_log.append(str(arg))
+                    elif isinstance(arg, KVCacheBlocks):
+                        to_log.append(
+                            f"num_blocks={[len(b) for b in arg.blocks]}")
+
+                # Log the event as a line to the file
+                try:
+                    with open(self._event_file, "a") as f:
+                        f.write(' '.join(to_log) + "\n")
+                except Exception as e:
+                    print(f"[ERROR] Could not log event {name} "
+                          f"for {self.name}: {e}")
+                return attr(*args, **kwargs)
+
+            return wrapper
+        return attr
+
+
+KVConnectorFactory.register_connector("TestSharedStorageConnector", __name__,
+                                      TestSharedStorageConnector.__name__)
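
Since the wrapper appends one space-separated line per intercepted call, the main test process can recover each connector's activity after the engine workers exit. A short sketch of reading the log back; the role string ("SCHEDULER") and the assertion are illustrative assumptions, not part of this commit:

    import tempfile

    def read_connector_events(name: str, role: str) -> list[list[str]]:
        # Mirrors the _event_file naming scheme used by
        # TestSharedStorageConnector above:
        # <tmpdir>/connector_<name>-<role>_events.log
        path = f"{tempfile.gettempdir()}/connector_{name}-{role}_events.log"
        with open(path) as f:
            return [line.split() for line in f if line.strip()]

    # Hypothetical check: the scheduler-side "storage1" connector was used.
    events = read_connector_events("storage1", "SCHEDULER")
    assert events, "expected at least one intercepted connector call"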

vllm/attention/layer.py

Lines changed: 1 addition & 1 deletion
@@ -10,6 +10,7 @@
 import vllm.envs as envs
 from vllm.attention import AttentionType
 from vllm.attention.selector import backend_name_to_enum, get_attn_backend
+from vllm.attention.utils.kv_sharing_utils import validate_kv_sharing_target
 from vllm.config import CacheConfig, get_current_vllm_config
 from vllm.distributed.kv_transfer import (get_kv_transfer_group,
                                           has_kv_transfer_group,

@@ -21,7 +22,6 @@
 from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod
 from vllm.platforms import _Backend, current_platform
 from vllm.utils import direct_register_custom_op
-from vllm.v1.attention.backends.utils import validate_kv_sharing_target
 
 
 class Attention(nn.Module):
vllm/attention/utils/kv_sharing_utils.py

Lines changed: 33 additions & 0 deletions
@@ -0,0 +1,33 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+def validate_kv_sharing_target(current_layer_name, target_layer_name,
+                               static_forward_context):
+    error_msg = (f"Specified KV sharing target layer for {current_layer_name} "
+                 f"is not valid: target layer {target_layer_name} ")
+
+    if current_layer_name == target_layer_name:
+        raise ValueError(error_msg +
+                         "cannot be the same as the current layer.")
+
+    if target_layer_name not in static_forward_context:
+        from vllm.model_executor.models.utils import extract_layer_index
+
+        # If target layer name is not in the static fwd context, it means either
+        # a) the target layer does not come BEFORE the current layer, or
+        # b) the target layer is not an Attention layer that exists in the model
+        current_layer_idx = extract_layer_index(current_layer_name)
+        target_layer_idx = extract_layer_index(target_layer_name)
+        if current_layer_idx <= target_layer_idx:
+            raise ValueError(error_msg + "must come before the current layer.")
+        else:
+            raise ValueError(error_msg +
+                             "is not a valid Attention layer in the model.")
+
+    # Currently KV sharing is only supported between layers of the same type
+    target_layer_attn_type = static_forward_context[
+        target_layer_name].attn_type
+    expected = static_forward_context[current_layer_name].attn_type
+    if target_layer_attn_type != expected:
+        raise ValueError(
+            error_msg +
+            f"must be the same type as the current layer ({expected}).")

vllm/logger.py

Lines changed: 14 additions & 0 deletions
@@ -53,6 +53,12 @@
 }
 
 
+@lru_cache
+def _print_debug_once(logger: Logger, msg: str, *args: Hashable) -> None:
+    # Set the stacklevel to 2 to print the original caller's line info
+    logger.debug(msg, *args, stacklevel=2)
+
+
 @lru_cache
 def _print_info_once(logger: Logger, msg: str, *args: Hashable) -> None:
     # Set the stacklevel to 2 to print the original caller's line info

@@ -74,6 +80,13 @@ class _VllmLogger(Logger):
     `intel_extension_for_pytorch.utils._logger`.
     """
 
+    def debug_once(self, msg: str, *args: Hashable) -> None:
+        """
+        As [`debug`][logging.Logger.debug], but subsequent calls with
+        the same message are silently dropped.
+        """
+        _print_debug_once(self, msg, *args)
+
     def info_once(self, msg: str, *args: Hashable) -> None:
         """
         As [`info`][logging.Logger.info], but subsequent calls with

@@ -132,6 +145,7 @@ def init_logger(name: str) -> _VllmLogger:
     logger = logging.getLogger(name)
 
     methods_to_patch = {
+        "debug_once": _print_debug_once,
         "info_once": _print_info_once,
         "warning_once": _print_warning_once,
     }
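
The new debug_once follows the existing info_once/warning_once pattern: lru_cache memoizes _print_debug_once on the (logger, msg, *args) tuple, so only the first call with a given message actually logs — which is why the args are typed Hashable. A small usage sketch with an illustrative message:

    from vllm.logger import init_logger

    logger = init_logger(__name__)

    for _ in range(32):
        # Logged exactly once; the 31 repeated calls hit the lru_cache on
        # _print_debug_once and are silently dropped.
        logger.debug_once("Using FlashInfer ragged prefill for MLA")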

vllm/v1/attention/backends/flashinfer.py

Lines changed: 5 additions & 68 deletions
@@ -14,13 +14,14 @@
 import vllm.envs as envs
 from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
                                               AttentionType)
-from vllm.attention.layer import Attention
-from vllm.config import VllmConfig, get_layers_from_vllm_config
 from vllm.logger import init_logger
 from vllm.v1.attention.backends.flash_attn import use_cascade_attention
 from vllm.v1.attention.backends.utils import (AttentionMetadataBuilder,
                                               CommonAttentionMetadata,
-                                              get_kv_cache_layout)
+                                              PerLayerParameters,
+                                              get_kv_cache_layout,
+                                              get_per_layer_parameters,
+                                              infer_global_hyperparameters)
 from vllm.v1.kv_cache_interface import AttentionSpec
 from vllm.v1.worker.block_table import BlockTable

@@ -93,70 +94,6 @@ def get_kv_cache_stride_order() -> tuple[int, ...]:
     return stride_order
 
 
-@dataclass
-class PerLayerParameters:
-    """
-    Currently, FlashInfer backend only support models in which all layers share
-    the same values for the following hyperparameters.
-    """
-
-    window_left: int
-    logits_soft_cap: Optional[float]
-    sm_scale: float
-
-
-def get_per_layer_parameters(
-        vllm_config: VllmConfig) -> dict[str, PerLayerParameters]:
-    """
-    Scan all attention layers and determine some hyperparameters
-    to use during `plan`.
-    """
-
-    layers = get_layers_from_vllm_config(vllm_config, Attention)
-    per_layer_params: dict[str, PerLayerParameters] = {}
-
-    for key, layer in layers.items():
-        impl = layer.impl
-        assert isinstance(impl, FlashInferImpl)
-
-        # Infer hyperparameters from the attention layer
-        window_size = impl.sliding_window
-        window_left = window_size[0] if window_size is not None else -1
-        logits_soft_cap = impl.logits_soft_cap
-        sm_scale = impl.scale
-
-        per_layer_params[key] = PerLayerParameters(window_left,
-                                                   logits_soft_cap, sm_scale)
-
-    return per_layer_params
-
-
-def infer_global_hyperparameters(
-        per_layer_params: dict[str, PerLayerParameters]) -> PerLayerParameters:
-    """
-    Currently, FlashInfer backend only support models in which all layers share
-    the same values for the following hyperparameters:
-    - `window_left`
-    - `logits_soft_cap`
-    - `sm_scale`
-
-    So this function asserts that all layers share the same values for these
-    hyperparameters and returns the global values.
-    """
-
-    assert len(per_layer_params) > 0, "No attention layers found in the model."
-
-    param_sets = list(per_layer_params.values())
-    global_params = param_sets[0]
-    for params in param_sets:
-        assert params == global_params, (
-            "FlashInfer backend currently only supports models in which all "
-            "layers share the same values for the following hyperparameters: "
-            "`window_left`, `logits_soft_cap`, `sm_scale`.")
-
-    return global_params
-
-
 @dataclass
 class FlashInferMetadata:

@@ -336,7 +273,7 @@ def _get_cascade_wrapper(self):
     def _plan(self, attn_metadata: FlashInferMetadata):
         if self.global_hyperparameters is None:
             self.global_hyperparameters = infer_global_hyperparameters(
-                get_per_layer_parameters(self.vllm_config))
+                get_per_layer_parameters(self.vllm_config, FlashInferImpl))
         if attn_metadata.use_cascade:
             attn_metadata.cascade_wrapper = self._get_cascade_wrapper()
             attn_metadata.cascade_wrapper.plan(
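
Net effect of this file's change: PerLayerParameters, get_per_layer_parameters, and infer_global_hyperparameters move from the FlashInfer backend into the shared vllm.v1.attention.backends.utils module, and the layer scan now takes the backend's impl class instead of hard-coding FlashInferImpl, so the MLA ragged-prefill path can reuse the same uniformity check. A sketch of the new call shape, with the generalized signature inferred from this call site rather than quoted from utils.py:

    from vllm.v1.attention.backends.utils import (get_per_layer_parameters,
                                                  infer_global_hyperparameters)

    # Inside the backend's metadata builder (self.vllm_config is the engine
    # config): collect window_left/logits_soft_cap/sm_scale for each layer
    # whose impl is FlashInferImpl, then assert they agree across the model.
    global_hyperparameters = infer_global_hyperparameters(
        get_per_layer_parameters(self.vllm_config, FlashInferImpl))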
