@@ -169,8 +169,8 @@ class FlashInferMetadata:
     # [0, 3, 6, 8]
     # The indptr of the paged kv cache, shape: [batch_size + 1] (CPU for plan)
     paged_kv_indptr_cpu: torch.Tensor
-    # The page indices of the paged kv cache (CPU for plan)
-    paged_kv_indices_cpu: torch.Tensor
+    # The page indices of the paged kv cache (on device for plan)
+    paged_kv_indices: torch.Tensor
     # The number of entries in the last page of each request in
     # the paged kv cache, shape: [batch_size] (CPU for plan)
     paged_kv_last_page_len_cpu: torch.Tensor
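A minimal sketch (all values and PAGE_SIZE are made up for illustration) of how these three fields describe the paged KV cache, following the [0, 3, 6, 8] indptr example in the comments above; after this change only paged_kv_indices lives on the device, while the indptr and last-page lengths stay on CPU for plan():

import torch

PAGE_SIZE = 16                                           # assumed page size

# Request i uses pages paged_kv_indices[indptr[i]:indptr[i + 1]].
paged_kv_indptr_cpu = torch.tensor([0, 3, 6, 8])         # CPU, consumed by plan()
paged_kv_indices = torch.tensor([5, 1, 9, 2, 7, 4, 0, 3])  # a device tensor in the real metadata
paged_kv_last_page_len_cpu = torch.tensor([16, 4, 11])   # fill level of each request's last page

# Tokens held by request 1: two full pages plus a partially filled last page.
pages_req1 = paged_kv_indices[paged_kv_indptr_cpu[1]:paged_kv_indptr_cpu[2]]
num_tokens_req1 = (len(pages_req1) - 1) * PAGE_SIZE + paged_kv_last_page_len_cpu[1]
# -> 2 * 16 + 4 = 36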
@@ -292,7 +292,7 @@ def _plan(self, num_prefills: int, num_decodes: int,
         # Ensure CPU tensors are not None
         assert attn_metadata.qo_indptr_cpu is not None
         assert attn_metadata.paged_kv_indptr_cpu is not None
-        assert attn_metadata.paged_kv_indices_cpu is not None
+        assert attn_metadata.paged_kv_indices is not None
         assert attn_metadata.paged_kv_last_page_len_cpu is not None
 
         if attn_metadata.use_cascade:
@@ -308,7 +308,7 @@ def _plan(self, num_prefills: int, num_decodes: int,
                 ],
                 [
                     attn_metadata.shared_kv_page_indices_cpu,
-                    attn_metadata.paged_kv_indices_cpu
+                    attn_metadata.paged_kv_indices
                 ],
                 [
                     attn_metadata.shared_kv_last_page_len_cpu,
@@ -347,7 +347,7 @@ def _plan(self, num_prefills: int, num_decodes: int,
             attn_metadata.prefill_wrapper.plan(
                 qo_indptr_cpu,
                 attn_metadata.paged_kv_indptr_cpu[prefill_start:],
-                attn_metadata.paged_kv_indices_cpu,
+                attn_metadata.paged_kv_indices,
                 attn_metadata.paged_kv_last_page_len_cpu[prefill_start:],
                 attn_metadata.num_qo_heads,
                 attn_metadata.num_kv_heads,
@@ -370,7 +370,7 @@ def _plan(self, num_prefills: int, num_decodes: int,
                     attn_metadata.num_kv_heads, attn_metadata.head_dim):
                 attn_metadata.decode_wrapper.plan(
                     attn_metadata.paged_kv_indptr_cpu[:num_decodes + 1],
-                    attn_metadata.paged_kv_indices_cpu,
+                    attn_metadata.paged_kv_indices,
                     attn_metadata.paged_kv_last_page_len_cpu[:num_decodes],
                     attn_metadata.num_qo_heads,
                     attn_metadata.num_kv_heads,
@@ -438,7 +438,7 @@ def build(self,
                                  dtype=torch.int32,
                                  device='cpu').unsqueeze(0)
                     < block_table_bounds_cpu.unsqueeze(1))
-        paged_kv_indices_cpu = block_table_tensor.cpu()[mask_cpu]
+        paged_kv_indices = block_table_tensor[mask_cpu]
 
         # paged_kv_indptr_cpu: cumulative sum of block_table_bounds_cpu
         paged_kv_indptr_cpu = torch.cat([
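A minimal sketch of the index construction in the build() hunk above, using a made-up padded block table; block_table_tensor, block_table_bounds_cpu, and mask_cpu are the names from the diff, and the only behavioural change here is that the boolean-mask indexing now runs on the block table's own device instead of a .cpu() copy:

import torch

block_table_tensor = torch.tensor([[10, 11, 12, 0],
                                   [20, 21, 0, 0],
                                   [30, 31, 32, 33]])   # padded block table, [batch, max_blocks]
block_table_bounds_cpu = torch.tensor([3, 2, 4])        # valid blocks per request

# Row-wise mask: column positions below each request's block count.
mask_cpu = (torch.arange(block_table_tensor.shape[1],
                         dtype=torch.int32,
                         device='cpu').unsqueeze(0)
            < block_table_bounds_cpu.unsqueeze(1))

# Flattened per-request page indices (device tensor in the real build()).
paged_kv_indices = block_table_tensor[mask_cpu]
# tensor([10, 11, 12, 20, 21, 30, 31, 32, 33])

# CSR-style offsets into paged_kv_indices: leading 0 plus cumulative block counts.
paged_kv_indptr_cpu = torch.cat([
    torch.zeros(1, dtype=block_table_bounds_cpu.dtype),
    block_table_bounds_cpu.cumsum(dim=0)
])
# tensor([0, 3, 5, 9])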
@@ -462,7 +462,7 @@ def build(self,
             qo_indptr=qo_indptr,
             qo_indptr_cpu=common_attn_metadata.query_start_loc_cpu,
             paged_kv_indptr_cpu=paged_kv_indptr_cpu,
-            paged_kv_indices_cpu=paged_kv_indices_cpu,
+            paged_kv_indices=paged_kv_indices,
             paged_kv_last_page_len_cpu=paged_kv_last_page_len_cpu,
             num_qo_heads=self.vllm_config.model_config.get_num_attention_heads(
                 self.vllm_config.parallel_config),