Commit 1c81513

[TPU] address comments
Signed-off-by: Chengji Yao <chengjiyao@google.com>
1 parent f805446 commit 1c81513

File tree: 1 file changed (+5 −3 lines)


vllm/v1/attention/backends/pallas.py

Lines changed: 5 additions & 3 deletions
```diff
@@ -368,10 +368,12 @@ def get_page_size_bytes(block_size: int, num_kv_heads: int, head_size: int,
     """Returns the size in bytes of one page of the KV cache."""
     padded_head_size = cdiv(head_size,
                             TPU_HEAD_SIZE_ALIGNMENT) * TPU_HEAD_SIZE_ALIGNMENT
+    num_combined_kv_heads = num_kv_heads * 2
+
+    # NOTE: for the implicit padding in XLA
     packing = get_dtype_packing(kv_cache_dtype)
-    # for the implicit padding in XLA
-    padded_head_size = max(padded_head_size, packing)
+    num_combined_kv_heads = cdiv(num_kv_heads * 2, packing) * packing
+
     kv_cache_dtype_bits = dtype_bits(kv_cache_dtype)
-    num_combined_kv_heads = num_kv_heads * 2
     return (block_size * num_combined_kv_heads * padded_head_size *
             kv_cache_dtype_bits // 8)
```
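
For context, the change moves the accounting for XLA's implicit padding from the head size to the combined K/V head count: rather than clamping `padded_head_size` to at least the packing factor, the number of combined heads is rounded up to a multiple of it. Below is a minimal, self-contained sketch of the resulting computation. The `TPU_HEAD_SIZE_ALIGNMENT` value and the bodies of `cdiv`, `dtype_bits`, and `get_dtype_packing` are assumptions for illustration, not vLLM's actual definitions.

```python
TPU_HEAD_SIZE_ALIGNMENT = 128  # assumed value; the real constant lives in pallas.py

def cdiv(a: int, b: int) -> int:
    """Ceiling division."""
    return -(-a // b)

def dtype_bits(kv_cache_dtype: str) -> int:
    # Assumed stand-in: bit width of one KV cache element.
    return {"float32": 32, "bfloat16": 16, "fp8": 8}[kv_cache_dtype]

def get_dtype_packing(kv_cache_dtype: str) -> int:
    # Assumed stand-in: how many elements XLA packs into one 32-bit word.
    return 32 // dtype_bits(kv_cache_dtype)

def get_page_size_bytes(block_size: int, num_kv_heads: int, head_size: int,
                        kv_cache_dtype: str) -> int:
    """Returns the size in bytes of one page of the KV cache."""
    # Pad the head size up to the TPU alignment boundary.
    padded_head_size = cdiv(head_size,
                            TPU_HEAD_SIZE_ALIGNMENT) * TPU_HEAD_SIZE_ALIGNMENT
    # K and V are stored combined, so the head count doubles; XLA then
    # implicitly pads that count up to a multiple of the packing factor.
    packing = get_dtype_packing(kv_cache_dtype)
    num_combined_kv_heads = cdiv(num_kv_heads * 2, packing) * packing
    kv_cache_dtype_bits = dtype_bits(kv_cache_dtype)
    return (block_size * num_combined_kv_heads * padded_head_size *
            kv_cache_dtype_bits // 8)

# bfloat16: packing = 32 // 16 = 2, so 2 combined heads need no padding:
# 16 * 2 * 128 * 16 // 8 = 8192
print(get_page_size_bytes(16, 1, 128, "bfloat16"))  # -> 8192

# fp8: packing = 32 // 8 = 4, so the 2 combined heads pad up to 4 and the
# implicit padding doubles the page: 16 * 4 * 128 * 8 // 8 = 8192
print(get_page_size_bytes(16, 1, 128, "fp8"))  # -> 8192
```

The fp8 example suggests why the accounting matters: for sub-32-bit dtypes with few KV heads, the padding XLA inserts along the head dimension can dominate the page size, and rounding the combined head count (rather than clamping the head size) appears to track that footprint more directly.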
