# SPDX-License-Identifier: Apache-2.0
"""Attention layer with AiterFlashAttention."""
from dataclasses import dataclass
- from typing import TYPE_CHECKING, Any, Optional
+ from typing import Any, Optional

import torch

from vllm import _custom_ops as ops
from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
                                              AttentionMetadata, AttentionType,
                                              is_quantized_kv_cache)
+ from vllm.config import VllmConfig
from vllm.logger import init_logger
from vllm.platforms import current_platform
from vllm.v1.attention.backends.flash_attn import (
    make_local_attention_virtual_batches)
from vllm.v1.attention.backends.utils import CommonAttentionMetadata
from vllm.v1.kv_cache_interface import AttentionSpec
- from vllm.v1.worker.block_table import BlockTable
-
- if TYPE_CHECKING:
-     from vllm.v1.core.sched.output import SchedulerOutput
-     from vllm.v1.worker.gpu_input_batch import InputBatch
-     from vllm.v1.worker.gpu_model_runner import GPUModelRunner

if current_platform.is_rocm():
    import aiter
@@ -171,56 +166,49 @@ def flash_attn_varlen_func_fake(

class AiterFlashAttentionMetadataBuilder:

-     def __init__(self, runner: "GPUModelRunner", kv_cache_spec: AttentionSpec,
-                  block_table: BlockTable):
-         model_config = runner.model_config
-
-         self.runner = runner
-         self.num_heads_q = model_config.get_num_attention_heads(
-             runner.parallel_config)
-         self.num_heads_kv = model_config.get_num_kv_heads(
-             runner.parallel_config)
-         self.headdim = model_config.get_head_size()
+     def __init__(self, kv_cache_spec: AttentionSpec, vllm_config: VllmConfig,
+                  device: torch.device):
+         self.vllm_config = vllm_config
+         self.model_config = vllm_config.model_config
+         self.parallel_config = vllm_config.parallel_config
+         self.cache_config = vllm_config.cache_config
+         self.device = device
+
+         self.num_heads_q = self.model_config.get_num_attention_heads(
+             self.parallel_config)
+         self.num_heads_kv = self.model_config.get_num_kv_heads(
+             self.parallel_config)
+         self.headdim = self.model_config.get_head_size()
        self.block_size = kv_cache_spec.block_size
        self.kv_cache_spec = kv_cache_spec
-         self.block_table = block_table

        # Sliding window size to be used with the AOT scheduler will be
        # populated on first build() call.
        self.aot_sliding_window: Optional[tuple[int, int]] = None

-     def reorder_batch(self, input_batch: "InputBatch",
-                       scheduler_output: "SchedulerOutput") -> bool:
+     def reorder_batch(self, input_batch, scheduler_output) -> bool:
        return False

    def build(self,
              common_prefix_len: int,
              common_attn_metadata: CommonAttentionMetadata,
              fast_build: bool = False) -> 'AiterFlashAttentionMetadata':

-         num_reqs = common_attn_metadata.num_reqs
        num_actual_tokens = common_attn_metadata.num_actual_tokens
        max_query_len = common_attn_metadata.max_query_len

-         max_seq_len = int(self.runner.seq_lens_np[:num_reqs].max())
-         total_tokens = int(self.runner.seq_lens_np[:num_reqs].sum())
+         max_seq_len = int(common_attn_metadata.seq_lens_cpu.max().item())
+         total_tokens = int(common_attn_metadata.seq_lens_cpu.sum().item())
        query_start_loc = common_attn_metadata.query_start_loc
+         query_start_loc_cpu = common_attn_metadata.query_start_loc_cpu
        seq_lens = common_attn_metadata.seq_lens
-         block_table = self.block_table
-         block_table_tensor = block_table.get_device_tensor()[:num_reqs]
-
-         block_table.slot_mapping[:num_actual_tokens].copy_(
-             block_table.slot_mapping_cpu[:num_actual_tokens],
-             non_blocking=True)
-         # Fill unused with -1. Needed for reshape_and_cache in full cuda graph
-         # mode.
-         block_table.slot_mapping[num_actual_tokens:].fill_(-1)
-
-         slot_mapping = block_table.slot_mapping[:num_actual_tokens]
+         seq_lens_cpu = common_attn_metadata.seq_lens_cpu
+         block_table_tensor = common_attn_metadata.block_table_tensor
+         slot_mapping = common_attn_metadata.slot_mapping

        cu_seq_lens = torch.zeros(seq_lens.shape[0] + 1,
                                  dtype=torch.int32,
-                                   device="cuda")
+                                   device=self.device)
        torch.cumsum(seq_lens,
                     dim=0,
                     dtype=cu_seq_lens.dtype,
@@ -232,21 +220,21 @@ def schedule(batch_size, cu_query_lens, max_query_len, seqlens,

        # for local attention
        local_attn_metadata = None
-         if self.runner.attention_chunk_size is not None:
+         if self.model_config.attention_chunk_size is not None:
            seqlens_q_local_np, virt_q_cu_seqlens_np, virt_k_seqlens_np, \
                virt_block_table_tensor = make_local_attention_virtual_batches(
-                     self.runner.attention_chunk_size,
-                     self.runner.query_start_loc_np[:num_reqs + 1],
-                     self.runner.seq_lens_np[:num_reqs],
+                     self.model_config.attention_chunk_size,
+                     query_start_loc_cpu.numpy(),
+                     seq_lens_cpu.numpy(),
                    block_table_tensor,
                    self.block_size,
                )
            local_query_start_loc = torch.from_numpy(virt_q_cu_seqlens_np).to(
-                 self.runner.device, non_blocking=True)
+                 self.device, non_blocking=True)
            local_seqused_k = torch.from_numpy(virt_k_seqlens_np).to(
-                 self.runner.device, non_blocking=True)
-             local_max_query_len = int(seqlens_q_local_np.max())
-             local_max_seq_len = int(virt_k_seqlens_np.max())
+                 self.device, non_blocking=True)
+             local_max_query_len = seqlens_q_local_np.max().item()
+             local_max_seq_len = virt_k_seqlens_np.max().item()
            local_scheduler_metadata = schedule(
                batch_size=local_query_start_loc.shape[0] - 1,
                cu_query_lens=local_query_start_loc,
@@ -257,12 +245,11 @@ def schedule(batch_size, cu_query_lens, max_query_len, seqlens,

            local_cu_seq_lens = torch.zeros(virt_k_seqlens_np.shape[0] + 1,
                                            dtype=torch.int32,
-                                             device=self.runner.device)
+                                             device=self.device)
            local_cu_seq_lens[1:] = torch.cumsum(
-                 torch.from_numpy(virt_k_seqlens_np).to(
-                     device=self.runner.device,
-                     dtype=torch.int32,
-                     non_blocking=True),
+                 torch.from_numpy(virt_k_seqlens_np).to(device=self.device,
+                                                        dtype=torch.int32,
+                                                        non_blocking=True),
                dim=0)
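
For context, a minimal sketch (not part of the diff) of how the refactored builder would now be wired up: it takes a `VllmConfig` and an explicit `torch.device` instead of a `GPUModelRunner`, and `build()` pulls everything it needs (seq_lens_cpu, query_start_loc_cpu, block_table_tensor, slot_mapping) from `CommonAttentionMetadata`. The `make_builder` helper, the concrete device string, and the import path are illustrative assumptions; only the constructor and `build()` signatures come from the diff above.

```python
# Sketch under assumptions: module path and helper name are illustrative.
import torch

from vllm.config import VllmConfig
from vllm.v1.attention.backends.rocm_aiter_fa import (
    AiterFlashAttentionMetadataBuilder)  # path assumed for this backend
from vllm.v1.kv_cache_interface import AttentionSpec


def make_builder(vllm_config: VllmConfig,
                 kv_cache_spec: AttentionSpec):
    # The device is now passed explicitly rather than read off the runner.
    device = torch.device("cuda:0")  # assumed device for illustration
    return AiterFlashAttentionMetadataBuilder(kv_cache_spec=kv_cache_spec,
                                              vllm_config=vllm_config,
                                              device=device)
```

Per-step metadata is then produced purely from the shared metadata object, e.g. `builder.build(common_prefix_len=0, common_attn_metadata=common_attn_metadata)`, which is what lets this change drop the `runner`, `BlockTable`, and slot-mapping bookkeeping from the builder itself.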