
Commit c6deb39

Merge branch 'vllm-project:main' into large-block-size-solution

2 parents: d326ac2 + 5782581

File tree: 8 files changed, +262 −67 lines

Lines changed: 108 additions & 0 deletions (new file)

@@ -0,0 +1,108 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import gc
import time
from typing import Optional

from tabulate import tabulate

from vllm.utils import FlexibleArgumentParser
from vllm.v1.core.block_pool import BlockPool


class Metric:
    def __init__(self) -> None:
        self.cnt: int = 0
        self.sum_v: int = 0
        self.max_v: Optional[int] = None

    def update(self, v: int) -> None:
        self.cnt += 1
        self.sum_v += v
        if self.max_v is None:
            self.max_v = v
        else:
            self.max_v = max(self.max_v, v)

    def avg_v(self) -> float:
        return self.sum_v * 1.0 / self.cnt


def main(args):
    rows = []
    for allocate_block in args.allocate_blocks:
        # Force a GC pass up front to minimize interference between runs.
        gc.collect()
        block_pool = BlockPool(num_gpu_blocks=args.num_gpu_blocks, enable_caching=True)

        get_blocks_metric: Metric = Metric()
        free_blocks_metric: Metric = Metric()
        for _ in range(args.num_iteration):
            t1 = time.monotonic_ns()
            blocks = block_pool.get_new_blocks(allocate_block)
            t2 = time.monotonic_ns()
            block_pool.free_blocks(blocks)
            t3 = time.monotonic_ns()
            get_blocks_metric.update(t2 - t1)
            free_blocks_metric.update(t3 - t2)

        if get_blocks_metric.max_v is not None and free_blocks_metric.max_v is not None:
            rows.append(
                [
                    get_blocks_metric.cnt,
                    args.num_gpu_blocks,
                    allocate_block,
                    get_blocks_metric.avg_v() / 1000000,
                    get_blocks_metric.max_v / 1000000.0,
                    free_blocks_metric.avg_v() / 1000000,
                    free_blocks_metric.max_v / 1000000.0,
                ]
            )
        else:
            print(
                "No valid metrics found."
                f" {get_blocks_metric.max_v=} {free_blocks_metric.max_v=}"
            )

    print(
        tabulate(
            rows,
            headers=[
                "Iterations",
                "Total\nBlocks",
                "Allocated\nBlocks",
                "Get Blocks\nAvg (ms)",
                "Get Blocks\nMax (ms)",
                "Free Blocks\nAvg (ms)",
                "Free Blocks\nMax (ms)",
            ],
            tablefmt="grid",
            floatfmt=".6f",
        )
    )


def invoke_main() -> None:
    parser = FlexibleArgumentParser(
        description="Benchmark the performance of BlockPool for KV Cache."
    )
    parser.add_argument("--num-gpu-blocks", type=int, default=100000)
    parser.add_argument(
        "--num-iteration",
        type=int,
        default=1000,
        help="Number of iterations to run to stabilize final data readings",
    )
    parser.add_argument(
        "--allocate-blocks",
        type=int,
        nargs="*",
        default=[10, 50, 100, 500, 1000],
        help="Number of blocks to allocate",
    )
    args = parser.parse_args()
    main(args)


if __name__ == "__main__":
    invoke_main()  # pragma: no cover
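
As a quick sanity check of the Metric helper above, a small worked example (not part of the commit; it assumes the Metric class defined in this new file is in scope):

# Feed three sample latencies (in nanoseconds) into a Metric and read the stats back.
m = Metric()
for v in (120, 80, 100):
    m.update(v)

assert m.cnt == 3          # three samples recorded
assert m.max_v == 120      # running maximum
assert m.avg_v() == 100.0  # (120 + 80 + 100) / 3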

csrc/torch_bindings.cpp

Lines changed: 8 additions & 4 deletions

@@ -20,13 +20,17 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
   // vLLM custom ops
   //

-  // The default behavior in PyTorch 2.6 is "requires_contiguous", so we need
+  // The default behavior in PyTorch 2.6 was changed to "requires_contiguous",
+  // so we need
   // to override this for many GEMMs with the following tag. Otherwise,
   // torch.compile will force all input tensors to be contiguous(), which
   // will break many custom ops that require column-major weight matrices.
-  // TODO: remove this for PyTorch 2.8, when the default is planned to switch
-  // to match exact eager-mode strides.
-  at::Tag stride_tag = at::Tag::needs_fixed_stride_order;
+  // This was a bug and PyTorch 2.7 has since fixed this.
+#if TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR == 6
+  #define stride_tag at::Tag::needs_fixed_stride_order
+#else
+  #define stride_tag
+#endif

   ops.def("weak_ref_tensor(Tensor input) -> Tensor");
   ops.impl("weak_ref_tensor", torch::kCUDA, &weak_ref_tensor);
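
The Python-registered ops later in this commit (rocm_aiter_mla.py and fused_moe.py below) apply the same gate at runtime. A minimal sketch of the idea, using a plain torch version check instead of vLLM's is_torch_equal_or_newer helper:

import torch

# Attach needs_fixed_stride_order only on PyTorch 2.6.x, where torch.compile
# would otherwise force inputs to be contiguous; newer releases fixed this.
if torch.__version__.startswith("2.6"):
    stride_tags = (torch.Tag.needs_fixed_stride_order, )
else:
    stride_tags = ()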

tests/v1/core/test_kv_cache_utils.py

Lines changed: 15 additions & 13 deletions

@@ -132,8 +132,8 @@ def test_free_kv_cache_block_queue_initialization():
     block = KVCacheBlock(block_id=0)
     queue = FreeKVCacheBlockQueue([block])
     assert queue.num_free_blocks == 1
-    assert queue.free_list_head == block
-    assert queue.free_list_tail == block
+    assert queue.fake_free_list_head.next_free_block is block
+    assert queue.fake_free_list_tail.prev_free_block is block


 def test_free_kv_cache_block_queue_operations():
@@ -145,36 +145,38 @@ def test_free_kv_cache_block_queue_operations():

     # Check initial state
     assert queue.num_free_blocks == 5
-    assert queue.free_list_head == blocks[0]
-    assert queue.free_list_tail == blocks[4]
+    assert queue.fake_free_list_head.next_free_block is blocks[0]
+    assert queue.fake_free_list_tail.prev_free_block is blocks[4]

     # Pop the first block
     block1 = queue.popleft()
     assert block1 == blocks[0]
     assert queue.num_free_blocks == 4
-    assert queue.free_list_head == blocks[1]
-    assert queue.free_list_tail == blocks[4]
+    assert queue.fake_free_list_head.next_free_block is blocks[1]
+    assert queue.fake_free_list_tail.prev_free_block is blocks[4]

     # Remove a block from the middle
     block_to_remove = blocks[2]
     queue.remove(block_to_remove)
     assert queue.num_free_blocks == 3
-    assert blocks[1].next_free_block == blocks[3]
-    assert blocks[3].prev_free_block == blocks[1]
+    assert blocks[1].next_free_block is blocks[3]
+    assert blocks[3].prev_free_block is blocks[1]

     # Append a block back
     queue.append(block_to_remove)
     assert queue.num_free_blocks == 4
-    assert queue.free_list_tail == block_to_remove
-    assert block_to_remove.prev_free_block == blocks[4]
-    assert block_to_remove.next_free_block is None
+    assert queue.fake_free_list_tail.prev_free_block is block_to_remove
+    assert block_to_remove.prev_free_block is blocks[4]
+    assert block_to_remove.next_free_block is queue.fake_free_list_tail

     # Pop blocks until empty
     for _ in range(4):
         queue.popleft()
     assert queue.num_free_blocks == 0
-    assert queue.free_list_head is None
-    assert queue.free_list_tail is None
+    assert (queue.fake_free_list_head.next_free_block
+            is queue.fake_free_list_tail)
+    assert (queue.fake_free_list_tail.prev_free_block
+            is queue.fake_free_list_head)

     # Attempt to pop from an empty queue
     with pytest.raises(ValueError) as e:
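
These assertion changes track a switch in FreeKVCacheBlockQueue from nullable head/tail pointers to permanent fake (sentinel) head and tail nodes. Below is a minimal sketch of that layout, written only to illustrate the invariants the new asserts check; the attribute names mirror the test, but the rest is illustrative rather than vLLM's implementation:

class _Node:
    def __init__(self):
        self.prev_free_block = None
        self.next_free_block = None


class SentinelFreeQueue:
    def __init__(self, blocks):
        # The sentinels always exist, so an empty queue is simply
        # "head.next is tail" and no None checks are ever needed.
        self.fake_free_list_head = _Node()
        self.fake_free_list_tail = _Node()
        self.fake_free_list_head.next_free_block = self.fake_free_list_tail
        self.fake_free_list_tail.prev_free_block = self.fake_free_list_head
        self.num_free_blocks = 0
        for b in blocks:
            self.append(b)

    def append(self, block):
        # Splice the block in just before the fake tail (O(1)).
        last = self.fake_free_list_tail.prev_free_block
        last.next_free_block = block
        block.prev_free_block = last
        block.next_free_block = self.fake_free_list_tail
        self.fake_free_list_tail.prev_free_block = block
        self.num_free_blocks += 1

    def remove(self, block):
        # Unlink in O(1); thanks to the sentinels every real block
        # always has both neighbours.
        block.prev_free_block.next_free_block = block.next_free_block
        block.next_free_block.prev_free_block = block.prev_free_block
        self.num_free_blocks -= 1

    def popleft(self):
        first = self.fake_free_list_head.next_free_block
        if first is self.fake_free_list_tail:
            raise ValueError("No free blocks available")
        self.remove(first)
        return first


# The empty-queue condition asserted at the end of the updated test:
q = SentinelFreeQueue([])
assert q.fake_free_list_head.next_free_block is q.fake_free_list_tail
assert q.fake_free_list_tail.prev_free_block is q.fake_free_list_head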

tests/v1/core/test_prefix_caching.py

Lines changed: 13 additions & 13 deletions

@@ -155,13 +155,14 @@ def test_prefill(hash_algo):
     assert block.ref_cnt == 2

     # At this point, we should have 5 free blocks left.
-    assert manager.block_pool.free_block_queue.num_free_blocks == 5
+    free_block_queue = manager.block_pool.free_block_queue
+    assert free_block_queue.num_free_blocks == 5

     manager.free(req0)
     manager.free(req1)

     # All blocks should be available.
-    assert manager.block_pool.free_block_queue.num_free_blocks == 10
+    assert free_block_queue.num_free_blocks == 10
     # The order should be
     # [unallocated (6, 7, 8, 9, 10)]
     # [unique_req0 (4)]
@@ -188,14 +189,10 @@

     # Although we only have 6 free blocks, we have 8 blocks in
     # the free block queue due to lazy removal.
-    assert manager.block_pool.free_block_queue.num_free_blocks == 6
-    assert all([
-        b.ref_cnt == 0
-        for b in manager.block_pool.free_block_queue.get_all_free_blocks()
-    ])
-    assert len([
-        b for b in manager.block_pool.free_block_queue.get_all_free_blocks()
-    ]) == 6
+    assert free_block_queue.num_free_blocks == 6
+    assert all(
+        [b.ref_cnt == 0 for b in free_block_queue.get_all_free_blocks()])
+    assert len([b for b in free_block_queue.get_all_free_blocks()]) == 6

     manager.free(req2)

@@ -209,9 +206,12 @@
                                     computed_blocks)
     # This block ID order also checks the eviction order.
     assert blocks.get_block_ids() == ([7, 8, 9, 10, 4, 5, 6, 3, 2, 1], )
-    assert manager.block_pool.free_block_queue.num_free_blocks == 0
-    assert manager.block_pool.free_block_queue.free_list_head is None
-    assert manager.block_pool.free_block_queue.free_list_tail is None
+
+    assert free_block_queue.num_free_blocks == 0
+    assert (free_block_queue.fake_free_list_head.next_free_block
+            is free_block_queue.fake_free_list_tail)
+    assert (free_block_queue.fake_free_list_tail.prev_free_block
+            is free_block_queue.fake_free_list_head)


 def test_prefill_hybrid_model():
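
Note that the rewritten asserts in both test files compare nodes with "is" rather than "==": the free list is a linked structure of specific block objects, so object identity, not value equality, is what the tests need to pin down. A tiny illustration (not from the commit):

x = [1]
y = [1]
assert x == y       # equal values...
assert x is not y   # ...but two distinct objects
alias = x
assert alias is x   # an alias is the same object, so the identity check passes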

vllm/attention/layer.py

Lines changed: 33 additions & 0 deletions

@@ -16,13 +16,42 @@
                                         has_kv_transfer_group,
                                         is_v1_kv_transfer_group)
 from vllm.forward_context import ForwardContext, get_forward_context
+from vllm.logger import init_logger
 from vllm.model_executor.layers.linear import UnquantizedLinearMethod
 from vllm.model_executor.layers.quantization.base_config import (
     QuantizationConfig)
 from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod
 from vllm.platforms import _Backend, current_platform
 from vllm.utils import direct_register_custom_op

+logger = init_logger(__name__)
+USE_XFORMERS_OPS = None
+
+
+def check_xformers_availability():
+    global USE_XFORMERS_OPS
+    if USE_XFORMERS_OPS is not None:
+        return USE_XFORMERS_OPS
+
+    if current_platform.is_cuda() and current_platform.has_device_capability(
+            100):
+        # Xformers FA is not compatible with B200
+        USE_XFORMERS_OPS = False
+    else:
+        try:
+            from importlib.util import find_spec
+
+            find_spec("xformers.ops")
+            USE_XFORMERS_OPS = True
+        except ImportError:
+            USE_XFORMERS_OPS = False
+
+    # the warning only needs to be shown once
+    if not USE_XFORMERS_OPS:
+        logger.warning("Xformers is not available, falling back.")
+
+    return USE_XFORMERS_OPS
+

 class Attention(nn.Module):
     """Attention layer.
@@ -314,6 +343,10 @@ def __init__(
             _Backend.TORCH_SDPA, _Backend.XFORMERS, _Backend.PALLAS_VLLM_V1
         } else _Backend.TORCH_SDPA

+        if (self.attn_backend == _Backend.XFORMERS
+                and not check_xformers_availability()):
+            self.attn_backend = _Backend.TORCH_SDPA
+
     def forward(
         self,
         query: torch.Tensor,
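
A side note on the probe used in check_xformers_availability above: importlib.util.find_spec locates a module without importing it, returning None when a top-level name is missing and raising ModuleNotFoundError (a subclass of ImportError) when the parent package of a dotted name is absent, which is why the except ImportError branch suffices. A small self-contained sketch (module names are only examples):

from importlib.util import find_spec


def probe(name: str) -> bool:
    # True when the module can be located, without actually importing it.
    try:
        return find_spec(name) is not None
    except ModuleNotFoundError:
        # Raised for dotted names whose parent package is not installed.
        return False


print(probe("json"))           # True: stdlib module
print(probe("no_such_pkg"))    # False: top-level name not found
print(probe("no_such_pkg.x"))  # False: missing parent package raises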

vllm/attention/ops/rocm_aiter_mla.py

Lines changed: 6 additions & 2 deletions

@@ -6,7 +6,7 @@
 import torch

 from vllm.platforms import current_platform
-from vllm.utils import direct_register_custom_op
+from vllm.utils import direct_register_custom_op, is_torch_equal_or_newer


 def get_aiter_mla_metadata(max_batch_size: int, block_size: int,
@@ -93,8 +93,12 @@ def mla_decode_fwd_fake(


 if current_platform.is_rocm():
+    if is_torch_equal_or_newer("2.7.0"):
+        tags = ()
+    else:
+        tags = (torch.Tag.needs_fixed_stride_order, )
     direct_register_custom_op(op_name="rocm_aiter_mla_decode_fwd",
                               op_func=mla_decode_fwd_impl,
                               mutates_args=["o"],
                               fake_impl=mla_decode_fwd_fake,
-                              tags=[torch.Tag.needs_fixed_stride_order])
+                              tags=tags)

vllm/model_executor/layers/fused_moe/fused_moe.py

Lines changed: 5 additions & 3 deletions

@@ -33,7 +33,7 @@
     dequant_mxfp4)
 from vllm.platforms import current_platform
 from vllm.triton_utils import tl, triton
-from vllm.utils import direct_register_custom_op
+from vllm.utils import direct_register_custom_op, is_torch_equal_or_newer
 from vllm.utils.deep_gemm import is_blackwell_deep_gemm_used

 from .rocm_aiter_fused_moe import is_rocm_aiter_moe_enabled
@@ -1056,7 +1056,8 @@ def inplace_fused_experts_fake(
     op_func=inplace_fused_experts,
     mutates_args=["hidden_states"],
     fake_impl=inplace_fused_experts_fake,
-    tags=(torch.Tag.needs_fixed_stride_order, ),
+    tags=(() if is_torch_equal_or_newer("2.7.0") else
+          (torch.Tag.needs_fixed_stride_order, )),
 )


@@ -1122,7 +1123,8 @@ def outplace_fused_experts_fake(
     op_func=outplace_fused_experts,
     mutates_args=[],
     fake_impl=outplace_fused_experts_fake,
-    tags=(torch.Tag.needs_fixed_stride_order, ),
+    tags=(() if is_torch_equal_or_newer("2.7.0") else
+          (torch.Tag.needs_fixed_stride_order, )),
 )
