Commit 4160c1e

more refactors
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
1 parent 6f30b98 commit 4160c1e

File tree

vllm/v1/attention/backends/mla/common.py
vllm/v1/attention/backends/mla/flashmla.py
vllm/v1/attention/backends/mla/rocm_aiter_mla.py

3 files changed: +43 -42 lines changed

vllm/v1/attention/backends/mla/common.py

Lines changed: 18 additions & 19 deletions
@@ -201,6 +201,7 @@
 from vllm.attention.backends.utils import get_mla_dims
 from vllm.attention.ops.merge_attn_states import merge_attn_states
 from vllm.attention.utils.fa_utils import get_flash_attn_version
+from vllm.config import VllmConfig
 from vllm.logger import init_logger
 from vllm.model_executor.layers.linear import (ColumnParallelLinear,
                                                LinearBase,
@@ -214,7 +215,6 @@
     reoder_batch_to_split_decodes_and_prefills,
     split_decodes_and_prefills)
 from vllm.v1.kv_cache_interface import AttentionSpec
-from vllm.v1.worker.block_table import BlockTable

 try:
     from vllm.vllm_flash_attn import flash_attn_varlen_func
@@ -234,7 +234,6 @@
 if TYPE_CHECKING:
     from vllm.v1.core.sched.output import SchedulerOutput
     from vllm.v1.worker.gpu_input_batch import InputBatch
-    from vllm.v1.worker.gpu_model_runner import GPUModelRunner

 logger = init_logger(__name__)

@@ -377,22 +376,23 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]):
     """

     def __init__(self,
-                 runner: "GPUModelRunner",
                  kv_cache_spec: AttentionSpec,
-                 block_table: BlockTable,
+                 vllm_config: VllmConfig,
+                 device: torch.device,
                  metadata_cls: Optional[type[M]] = None):
         self.metadata_cls = metadata_cls \
             if metadata_cls is not None else MLACommonMetadata
-        self.runner = runner
-        scheduler_config = runner.scheduler_config
-        model_config = runner.model_config
-        cache_config = runner.cache_config
+        self.kv_cache_spec = kv_cache_spec
+        self.device = device
+        scheduler_config = vllm_config.scheduler_config
+        self.model_config = vllm_config.model_config
+        cache_config = vllm_config.cache_config
+        parallel_config = vllm_config.parallel_config
         self.chunked_prefill_enabled = scheduler_config.chunked_prefill_enabled
-        self.num_heads = model_config.get_num_attention_heads(
-            runner.parallel_config)
-        self.mla_dims = get_mla_dims(model_config)
+        self.num_heads = self.model_config.get_num_attention_heads(
+            parallel_config)
+        self.mla_dims = get_mla_dims(self.model_config)
         self.aot_schedule = current_platform.is_cuda()
-        self.kv_cache_spec = kv_cache_spec

         # Dont try to access the runner on AMD
         if self.aot_schedule:
@@ -403,7 +403,7 @@ def __init__(self,
                 # Max sure there is enough for 8 full length request or at least
                 # 4 pages of cache per request
                 max(
-                    8 * model_config.max_model_len, 4 *
+                    8 * self.model_config.max_model_len, 4 *
                     scheduler_config.max_num_seqs * cache_config.block_size),
                 # For long-context models try not to over-allocate limiting
                 # kv-cache space, limiting it to 64k tokens,
@@ -418,11 +418,10 @@ def __init__(self,
             scheduler_config.max_num_seqs * cache_config.block_size
         self.chunked_prefill_workspace = torch.empty(
             (self.chunked_prefill_workspace_size,
-             model_config.get_head_size()),
-            dtype=model_config.dtype,
-            device=runner.device,
+             self.model_config.get_head_size()),
+            dtype=self.model_config.dtype,
+            device=device,
         )
-        self.block_table = block_table

         self._use_fi_prefill = use_flashinfer_prefill()
         self.prefill_metadata_cls = FlashInferPrefillMetadata \
@@ -558,7 +557,7 @@ def build(self,
         # Note(simon): be careful about the CPU <> GPU memory movement in this
        # function. We should avoid GPU -> CPU sync as much as possible because
         # it blocks on all previous kernels.
-        device = self.runner.device
+        device = self.device
         block_table_tensor = common_attn_metadata.block_table_tensor
         slot_mapping = common_attn_metadata.slot_mapping

@@ -664,7 +663,7 @@ def build(self,
             num_actual_tokens=num_tokens,
             query_start_loc=query_start_loc,
             slot_mapping=slot_mapping,
-            head_dim=self.runner.model_config.get_head_size(),
+            head_dim=self.model_config.get_head_size(),
             # MLACommonMetadata Chunk prefill specific
             num_decodes=num_decodes,
             num_decode_tokens=num_decode_tokens,
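
The net effect of these hunks is that MLACommonMetadataBuilder no longer needs a GPUModelRunner handle: everything it reads comes from a VllmConfig plus an explicitly passed device. A minimal sketch of what construction looks like from a caller's side under the new signature (the helper function and the device choice below are assumptions for illustration, not part of this commit):

import torch

from vllm.config import VllmConfig
from vllm.v1.attention.backends.mla.common import MLACommonMetadataBuilder
from vllm.v1.kv_cache_interface import AttentionSpec


def make_mla_metadata_builder(
        kv_cache_spec: AttentionSpec,
        vllm_config: VllmConfig) -> MLACommonMetadataBuilder:
    # Hypothetical call site: the runner-side wiring is outside this diff.
    # The builder now takes the full config object and an explicit device
    # instead of a runner handle; metadata_cls defaults to MLACommonMetadata.
    device = torch.device("cuda:0")  # assumption: single-GPU placement
    return MLACommonMetadataBuilder(kv_cache_spec, vllm_config, device)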

vllm/v1/attention/backends/mla/flashmla.py

Lines changed: 8 additions & 7 deletions
@@ -11,14 +11,14 @@
 from vllm.attention.ops.flashmla import (flash_mla_with_kvcache,
                                          get_mla_metadata,
                                          is_flashmla_supported)
+from vllm.config import VllmConfig
 from vllm.logger import init_logger
 from vllm.v1.attention.backends.mla.common import (MLACommonBackend,
                                                    MLACommonDecodeMetadata,
                                                    MLACommonImpl,
                                                    MLACommonMetadata,
                                                    MLACommonMetadataBuilder)
 from vllm.v1.kv_cache_interface import AttentionSpec
-from vllm.v1.worker.block_table import BlockTable

 logger = init_logger(__name__)

@@ -56,12 +56,13 @@ class FlashMLAMetadata(MLACommonMetadata[FlashMLADecodeMetadata]):
 class FlashMLAMetadataBuilder(MLACommonMetadataBuilder[FlashMLAMetadata]):
     full_cudagraph_supported: ClassVar[bool] = True  # Decode-only

-    def __init__(self, runner, kv_cache_spec: AttentionSpec,
-                 block_table: BlockTable):
-        super().__init__(runner, kv_cache_spec, block_table, FlashMLAMetadata)
+    def __init__(self, kv_cache_spec: AttentionSpec, vllm_config: VllmConfig,
+                 device: torch.device):
+        super().__init__(kv_cache_spec, vllm_config, device, FlashMLAMetadata)

-        self.num_q_heads = self.runner.model_config.get_num_attention_heads(
-            self.runner.parallel_config)
+        self.compilation_config = vllm_config.compilation_config
+        self.num_q_heads = vllm_config.model_config.get_num_attention_heads(
+            vllm_config.parallel_config)

         self.cg_buf_tile_scheduler_metadata = None
         self.cg_buf_num_splits = None
@@ -75,7 +76,7 @@ def _build_decode(self, block_table_tensor: torch.Tensor,
             1,  # MQA for the decode path
         )

-        if self.runner.full_cuda_graph:
+        if self.compilation_config.full_cuda_graph:
            # First time around (CUDAGraph capture), allocate the static buffer
            if self.cg_buf_tile_scheduler_metadata is None:
                self.cg_buf_tile_scheduler_metadata = tile_scheduler_metadata
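
In the last hunk, full_cuda_graph is now read from the compilation config rather than from the runner, and the cg_buf_* attributes implement the usual allocate-once pattern for full CUDA graph capture: the first build (during capture) keeps the computed tensor as a static buffer, and later builds must write into that same storage so the captured graph sees fresh values. A generic sketch of that idea, assuming plain PyTorch rather than vLLM's exact replay path:

from typing import Optional

import torch


class StaticBufferCache:
    """Allocate-once buffer reuse, as needed under CUDA graph replay."""

    def __init__(self) -> None:
        self.buf: Optional[torch.Tensor] = None

    def update(self, new_values: torch.Tensor) -> torch.Tensor:
        if self.buf is None:
            # Capture time: keep this tensor as the graph's static buffer.
            self.buf = new_values
        else:
            # Replay time: copy in place so the graph reads the new data.
            self.buf[:new_values.size(0)].copy_(new_values)
        return self.buf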

vllm/v1/attention/backends/mla/rocm_aiter_mla.py

Lines changed: 17 additions & 16 deletions
@@ -8,6 +8,7 @@

 import vllm.envs as envs
 from vllm.attention.ops.rocm_aiter_mla import aiter_mla_decode_fwd
+from vllm.config import VllmConfig
 # yapf conflicts with isort for this docstring
 # yapf: disable
 from vllm.v1.attention.backends.mla.common import (MLACommonBackend,
@@ -16,7 +17,6 @@
                                                     MLACommonMetadata,
                                                     MLACommonMetadataBuilder)
 from vllm.v1.kv_cache_interface import AttentionSpec
-from vllm.v1.worker.block_table import BlockTable

 # yapf: enable

@@ -65,24 +65,25 @@ class AiterMLAMetadata(MLACommonMetadata[AiterMLADecodeMetadata]):
 class AiterMLAMetadataBuilder(MLACommonMetadataBuilder[AiterMLAMetadata]):
     full_cudagraph_supported: ClassVar[bool] = True  # decode only

-    def __init__(self, runner, kv_cache_spec: AttentionSpec,
-                 block_table: BlockTable):
-        super().__init__(runner, kv_cache_spec, block_table, AiterMLAMetadata)
+    def __init__(self, kv_cache_spec: AttentionSpec, vllm_config: VllmConfig,
+                 device: torch.device):
+        super().__init__(kv_cache_spec, vllm_config, device, AiterMLAMetadata)
         assert self.kv_cache_spec.block_size == 1, "AITER MLA" \
             "only supports block size 1."

+        self.compilation_config = vllm_config.compilation_config
+
         # Preparing persistent buffers
-        if self.runner.full_cuda_graph:
-            device = self.runner.device
-            max_num_reqs = self.runner.max_num_reqs
+        if vllm_config.compilation_config.full_cuda_graph:
+            max_num_reqs = vllm_config.scheduler_config.max_num_seqs
             self.paged_kv_indptr = torch.zeros(max_num_reqs + 1,
                                                dtype=torch.int32,
                                                device=device)
-            self.paged_kv_indices = torch.zeros(
-                block_table.get_device_tensor().numel(
-                ),  # max num pages possible
-                dtype=torch.int32,
-                device=device)
+            # We'll assume a reasonable max number of pages
+            max_pages = max_num_reqs * 1024  # Rough estimate
+            self.paged_kv_indices = torch.zeros(max_pages,
+                                                dtype=torch.int32,
+                                                device=device)
             self.paged_kv_last_page_len = torch.zeros(max_num_reqs,
                                                       dtype=torch.int32,
                                                       device=device)
@@ -96,7 +97,8 @@ def _build_decode(self, block_table_tensor: torch.Tensor,
                       seq_lens: torch.Tensor) -> AiterMLADecodeMetadata:
         page_size = self.kv_cache_spec.block_size
         block_table_bounds = (seq_lens + page_size - 1) // page_size
-        device = self.runner.device
+        device = self.device
+        num_reqs = seq_lens.size(0)

         mask = (torch.arange(block_table_tensor.size(1),
                              dtype=block_table_tensor.dtype,
@@ -113,8 +115,7 @@ def _build_decode(self, block_table_tensor: torch.Tensor,
             block_table_bounds.cumsum(dim=0, dtype=torch.int32)
         ])

-        if self.runner.full_cuda_graph:
-            num_reqs = self._num_decodes
+        if self.compilation_config.full_cuda_graph:

             num_actual_pages = paged_kv_indices.size(0)

@@ -137,7 +138,7 @@ def _build_decode(self, block_table_tensor: torch.Tensor,

         else:
             qo_indptr = torch.arange(0,
-                                     self._num_decodes + 1,
+                                     num_reqs + 1,
                                      step=1,
                                      dtype=torch.int32,
                                      device=device)
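
One behavioral note on the constructor hunk above: the paged_kv_indices capacity is no longer derived from the block table's device tensor but from a rough per-request estimate. A small sketch of the sizing math under the new scheme (the helper function and the example max_num_seqs value are assumptions, not from this commit):

from vllm.config import VllmConfig


def persistent_buffer_sizes(vllm_config: VllmConfig) -> tuple[int, int]:
    # Mirrors the sizing used in AiterMLAMetadataBuilder.__init__ above.
    max_num_reqs = vllm_config.scheduler_config.max_num_seqs
    max_pages = max_num_reqs * 1024  # rough per-request page estimate
    return max_num_reqs + 1, max_pages  # indptr slots, page-index slots


# With max_num_seqs = 256, for example, this reserves 257 indptr slots and
# 256 * 1024 = 262,144 page-index slots.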
