Commit fa85ae0

move slot table computation into block_table
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
1 parent 80fb05a commit fa85ae0

2 files changed: +47 -42 lines changed

vllm/v1/worker/block_table.py

Lines changed: 37 additions & 4 deletions
@@ -14,12 +14,14 @@ class BlockTable:
 
     def __init__(
         self,
+        block_size: int,
         max_num_reqs: int,
         max_num_blocks_per_req: int,
         max_num_batched_tokens: int,
         pin_memory: bool,
         device: torch.device,
     ):
+        self.block_size = block_size
         self.max_num_reqs = max_num_reqs
         self.max_num_blocks_per_req = max_num_blocks_per_req
         self.max_num_batched_tokens = max_num_batched_tokens
@@ -79,10 +81,31 @@ def swap_row(self, src: int, tgt: int) -> None:
 
         self.block_table_np[[src, tgt]] = self.block_table_np[[tgt, src]]
 
-    def commit(self, num_reqs: int) -> None:
+    def compute_slot_mapping(self, req_indices: np.ndarray,
+                             positions: np.ndarray) -> None:
+        # E.g., [0, 1, 0, 1, 2, 3, 4, 0, 1, 2]
+        # -> [0, 0, K, K, K + 1, K + 1, K + 2, 2 * K, 2 * K, 2 * K + 1]
+        # where K is the max_num_blocks_per_req and the block size is 2.
+        # NOTE(woosuk): We can't simply use `token_indices // block_size`
+        # here because M (max_model_len) is not necessarily divisible by
+        # block_size.
+        block_table_indices = (req_indices * self.max_num_blocks_per_req +
+                               positions // self.block_size)
+        block_table_cpu = self.get_cpu_tensor()
+        block_numbers = block_table_cpu.flatten()[block_table_indices].numpy()
+        block_offsets = positions % self.block_size
+        np.add(block_numbers * self.block_size,
+               block_offsets,
+               out=self.slot_mapping_np[:req_indices.shape[0]])
+
+    def commit_block_table(self, num_reqs: int) -> None:
         self.block_table[:num_reqs].copy_(self.block_table_cpu[:num_reqs],
                                           non_blocking=True)
 
+    def commit_slot_mapping(self, num_tokens: int) -> None:
+        self.slot_mapping[:num_tokens].copy_(
+            self.slot_mapping_cpu[:num_tokens], non_blocking=True)
+
     def clear(self) -> None:
         self.block_table.fill_(0)
         self.block_table_cpu.fill_(0)
@@ -107,7 +130,8 @@ def __init__(self, max_num_reqs: int, max_model_len: int,
                  max_num_batched_tokens: int, pin_memory: bool,
                  device: torch.device, block_sizes: list[int]) -> None:
         self.block_tables = [
-            BlockTable(max_num_reqs, cdiv(max_model_len, block_size),
+            BlockTable(block_size, max_num_reqs, cdiv(max_model_len,
+                                                      block_size),
                        max_num_batched_tokens, pin_memory, device)
             for block_size in block_sizes
         ]
@@ -129,9 +153,18 @@ def swap_row(self, src: int, tgt: int) -> None:
         for block_table in self.block_tables:
             block_table.swap_row(src, tgt)
 
-    def commit(self, num_reqs: int) -> None:
+    def compute_slot_mapping(self, req_indices: np.ndarray,
+                             positions: np.ndarray) -> None:
+        for block_table in self.block_tables:
+            block_table.compute_slot_mapping(req_indices, positions)
+
+    def commit_block_table(self, num_reqs: int) -> None:
+        for block_table in self.block_tables:
+            block_table.commit_block_table(num_reqs)
+
+    def commit_slot_mapping(self, num_tokens: int) -> None:
         for block_table in self.block_tables:
-            block_table.commit(num_reqs)
+            block_table.commit_slot_mapping(num_tokens)
 
     def clear(self) -> None:
         for block_table in self.block_tables:
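
For reference, here is a minimal standalone sketch of the slot-mapping arithmetic that compute_slot_mapping performs, using toy values (the block-table contents, block_size, max_num_blocks_per_req, and the req_indices/positions arrays below are made up for illustration; the real method writes into the preallocated slot_mapping_np buffer instead of returning a new array):

import numpy as np

block_size = 2
max_num_blocks_per_req = 4  # "K" in the comment above

# Toy block table: request 0 owns physical blocks [10, 11, 12, 13],
# request 1 owns [20, 21, 22, 23].
block_table = np.array([[10, 11, 12, 13],
                        [20, 21, 22, 23]])

# One entry per scheduled token: the request it belongs to and the token's
# position within that request.
req_indices = np.array([0, 0, 0, 1, 1])
positions = np.array([0, 1, 2, 0, 1])

# Index into the flattened block table, then combine the physical block
# number with the in-block offset to get a flat KV-cache slot index.
block_table_indices = (req_indices * max_num_blocks_per_req +
                       positions // block_size)
block_numbers = block_table.flatten()[block_table_indices]
block_offsets = positions % block_size
slot_mapping = block_numbers * block_size + block_offsets
print(slot_mapping)  # [20 21 22 40 41]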

vllm/v1/worker/gpu_model_runner.py

Lines changed: 10 additions & 38 deletions
@@ -4,7 +4,6 @@
 import copy
 import gc
 import time
-import weakref
 from contextlib import contextmanager
 from typing import TYPE_CHECKING, Any, Optional, Union
 
@@ -64,7 +63,6 @@
 from vllm.v1.spec_decode.ngram_proposer import NgramProposer
 from vllm.v1.spec_decode.utils import is_spec_decode_supported
 from vllm.v1.utils import bind_kv_cache
-from vllm.v1.worker.block_table import BlockTable
 from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch
 from vllm.v1.worker.lora_model_runner_mixin import LoRAModelRunnerMixin
 
@@ -610,7 +608,7 @@ def _prepare_inputs(
 
         # OPTIMIZATION: Start copying the block table first.
         # This way, we can overlap the copy with the following CPU operations.
-        self.input_batch.block_table.commit(num_reqs)
+        self.input_batch.block_table.commit_block_table(num_reqs)
 
         # Get the number of scheduled tokens for each request.
         req_ids = self.input_batch.req_ids
@@ -654,29 +652,10 @@ def _prepare_inputs(
                            torch.from_numpy(token_indices),
                            out=self.input_ids_cpu[:total_num_scheduled_tokens])
 
-        # Calculate the slot mapping for each KV cache group.
-        for kv_cache_group_id, kv_cache_group_spec in enumerate(
-                self.kv_cache_config.kv_cache_groups):
-            block_size = kv_cache_group_spec.kv_cache_spec.block_size
-            block_table: BlockTable = self.input_batch.block_table[
-                kv_cache_group_id]
-            # E.g., [0, 1, 0, 1, 2, 3, 4, 0, 1, 2]
-            # -> [0, 0, K, K, K + 1, K + 1, K + 2, 2 * K, 2 * K, 2 * K + 1]
-            # where K is the max_num_blocks_per_req and the block size is 2.
-            # NOTE(woosuk): We can't simply use `token_indices // block_size`
-            # here because M (max_model_len) is not necessarily divisible by
-            # block_size.
-            block_table_indices = (
-                req_indices * block_table.max_num_blocks_per_req +
-                positions_np // block_size)
-            block_table_cpu = block_table.get_cpu_tensor()
-            block_numbers = block_table_cpu.flatten(
-            )[block_table_indices].numpy()
-            block_offsets = positions_np % block_size
-            np.add(
-                block_numbers * block_size,
-                block_offsets,
-                out=block_table.slot_mapping_np[:total_num_scheduled_tokens])
+        self.input_batch.block_table.compute_slot_mapping(
+            req_indices, positions_np)
+        self.input_batch.block_table.commit_slot_mapping(
+            total_num_scheduled_tokens)
 
         # Prepare the attention metadata.
         self.query_start_loc_np[0] = 0
@@ -722,12 +701,6 @@ def _prepare_inputs(
         for kv_cache_group_id, kv_cache_group_spec in enumerate(
                 self.kv_cache_config.kv_cache_groups):
 
-            slot_mapping = self.input_batch.block_table[
-                kv_cache_group_id].slot_mapping[:num_reqs]
-            slot_mapping.copy_(self.input_batch.block_table[kv_cache_group_id].
-                               slot_mapping_np[:num_reqs],
-                               non_blocking=True)
-
             common_attn_metadata = CommonAttentionMetadata(
                 query_start_loc=self.query_start_loc[:num_reqs + 1],
                 query_start_loc_cpu=self.query_start_loc_cpu[:num_reqs + 1],
@@ -740,7 +713,8 @@ def _prepare_inputs(
                 max_query_len=max_num_scheduled_tokens,
                 block_table_tensor=self.input_batch.
                 block_table[kv_cache_group_id].get_device_tensor()[:num_reqs],
-                slot_mapping=slot_mapping,
+                slot_mapping=self.input_batch.block_table[kv_cache_group_id].
+                slot_mapping[:total_num_scheduled_tokens],
             )
 
             if self.speculative_config and \
@@ -1679,8 +1653,7 @@ def propose_draft_token_ids(
                 for i, n in enumerate(num_draft_tokens)
             ]
             num_rejected_tokens_cpu = torch.tensor(num_rejected_tokens,
-                                                   dtype=torch.int32,
-                                                   device=self.device)
+                                                   dtype=torch.int32)
             num_tokens = (num_scheduled_tokens -
                           num_rejected_tokens_cpu.sum())
             common_attn_metadata, token_indices =\
@@ -2389,11 +2362,10 @@ def initialize_attn_backend(self, kv_cache_config: KVCacheConfig) -> None:
                 raise ValueError(
                     f"Unknown KV cache spec type: {type(kv_cache_spec)}")
 
-            block_table_i = self.input_batch.block_table[i]
             attn_metadata_builder_i = attn_backend_i.get_builder_cls()(
-                weakref.proxy(self),
                 kv_cache_spec,
-                block_table_i,
+                self.vllm_config,
+                self.device,
             )
 
             if (self.full_cuda_graph
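
The runner-side change keeps the existing optimization of kicking off the block-table H2D copy early (commit_block_table) so it overlaps with the CPU work that follows, and the slot mapping now goes through the same path via commit_slot_mapping. Below is a minimal sketch of that non-blocking pinned-copy pattern on its own, with illustrative buffer names and sizes rather than the real BlockTable attributes:

import torch

max_num_batched_tokens = 8192
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")

# Pinned (page-locked) CPU staging buffer plus a preallocated device buffer.
slot_mapping_cpu = torch.zeros(max_num_batched_tokens,
                               dtype=torch.int64,
                               pin_memory=use_cuda)
slot_mapping = torch.zeros(max_num_batched_tokens,
                           dtype=torch.int64,
                           device=device)

# Fill the first num_tokens entries on the CPU ...
num_tokens = 5
slot_mapping_cpu[:num_tokens] = torch.tensor([20, 21, 22, 40, 41])

# ... then copy only that prefix to the device with non_blocking=True, so the
# host is free to continue CPU-side preparation while the transfer runs.
slot_mapping[:num_tokens].copy_(slot_mapping_cpu[:num_tokens],
                                non_blocking=True)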

0 commit comments
