
Commit 834b496

review comments
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
1 parent da92d38 commit 834b496

File tree: 3 files changed, +33 -28 lines changed

    tests/v1/attention/test_attention_backends.py
    tests/v1/attention/utils.py
    vllm/v1/attention/backends/flash_attn.py

tests/v1/attention/test_attention_backends.py

Lines changed: 16 additions & 17 deletions
@@ -5,21 +5,25 @@
 import pytest
 import torch
 
-from tests.v1.attention.utils import (BatchSpec, create_common_attn_metadata,
+from tests.v1.attention.utils import (BatchSpec, _Backend,
+                                      create_common_attn_metadata,
                                       create_standard_kv_cache_spec,
                                       create_vllm_config,
                                       get_attention_backend)
 from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, cdiv
 from vllm.v1.attention.backends.utils import CommonAttentionMetadata
 from vllm.v1.kv_cache_interface import FullAttentionSpec
 
-BACKENDS_TO_TEST = ["flash_attn", "flashinfer", "flex_attention"]
+BACKENDS_TO_TEST = [
+    _Backend.FLASH_ATTN_VLLM_V1, _Backend.FLASHINFER_VLLM_V1,
+    _Backend.FLEX_ATTENTION, _Backend.TRITON_ATTN_VLLM_V1
+]
 
 # Remove flashinfer from the list if it's not available
 try:
     import flashinfer  # noqa: F401
 except ImportError:
-    BACKENDS_TO_TEST.remove("flashinfer")
+    BACKENDS_TO_TEST.remove(_Backend.FLASHINFER_VLLM_V1)
 
 
 def _convert_dtype_to_torch(dtype):
@@ -197,18 +201,18 @@ def __init__(self):
         self._v_scale_float = 1.0
 
 
-def run_attention_backend(backend_name: str, kv_cache_spec: FullAttentionSpec,
+def run_attention_backend(backend: _Backend, kv_cache_spec: FullAttentionSpec,
                           vllm_config, device: torch.device,
                           common_attn_metadata: CommonAttentionMetadata,
                           query: torch.Tensor, key: torch.Tensor,
                           value: torch.Tensor,
                           kv_cache: torch.Tensor) -> torch.Tensor:
     """Run attention computation using the specified backend's AttentionImpl."""
 
-    builder_cls, impl_cls = get_attention_backend(backend_name)
+    builder_cls, impl_cls = get_attention_backend(backend)
 
     # Mock flashinfer's get_per_layer_parameters if needed
-    if backend_name == "flashinfer":
+    if backend == _Backend.FLASHINFER_VLLM_V1:
         import unittest.mock
 
         from vllm.v1.attention.backends.flashinfer import PerLayerParameters
@@ -417,7 +421,7 @@ def test_backend_correctness(batch_spec_name: str, model: str):
         # [num_blocks, 2, block_size, num_kv_heads, head_size]
         # Select the appropriate KV cache format for each backend
         kv_cache_for_backend = kv_cache
-        if backend_name == "flashinfer":
+        if backend_name == _Backend.FLASHINFER_VLLM_V1:
             kv_cache_for_backend = kv_cache.transpose(0, 1)
 
         backend_output = run_attention_backend(backend_name, kv_cache_spec,
@@ -440,17 +444,12 @@ def test_backend_correctness(batch_spec_name: str, model: str):
 
         # Check numerical similarity
         rtol = 1e-2
-        atol = 1e-3
+        atol = 5e-3
 
-        # Flashinfer and Flex_attention may have slightly different
-        # numerical behavior
-        if backend_name == "flashinfer":
-            atol = 5e-3
-
-        if backend_name == "flex_attention":
-            atol = 5e-1  # TODO: figuure out why flex_attention has such large
-            # numerical differences for
-            # medium_decode, medium_prefill, mixed_medium
+        if backend_name == _Backend.FLEX_ATTENTION:
+            atol = 5e-1  # TODO: figure out why flex_attention has such large
+            # numerical differences for medium_decode, medium_prefill,
+            # mixed_medium
 
         max_diff = torch.max(torch.abs(backend_output - sdpa_output)).item()
         max_rel_diff = torch.max(
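
The move from string names to `_Backend` enum members keeps the test's backend list in the same vocabulary vLLM uses for backend selection, which is also what lets `TRITON_ATTN_VLLM_V1` be added without inventing a new string key. A minimal sketch of how a test could be parametrized over the enum-based list, assuming only the `_Backend` import and the `BACKENDS_TO_TEST` definition from the diff above (the test function itself is illustrative, not the one in this file):

```python
import pytest

from vllm.platforms import _Backend

# Same list as in the diff; TRITON_ATTN_VLLM_V1 is the newly added entry.
BACKENDS_TO_TEST = [
    _Backend.FLASH_ATTN_VLLM_V1, _Backend.FLASHINFER_VLLM_V1,
    _Backend.FLEX_ATTENTION, _Backend.TRITON_ATTN_VLLM_V1
]


@pytest.mark.parametrize("backend", BACKENDS_TO_TEST, ids=lambda b: b.name)
def test_backend_is_enum_member(backend):
    # Enum members give exact, typo-proof comparisons: the flashinfer
    # special-casing above becomes `backend == _Backend.FLASHINFER_VLLM_V1`
    # instead of matching a bare string.
    assert isinstance(backend, _Backend)
```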

tests/v1/attention/utils.py

Lines changed: 13 additions & 11 deletions
@@ -11,6 +11,8 @@
 from vllm.config import (CacheConfig, CompilationConfig, DeviceConfig,
                          LoadConfig, ModelConfig, ModelDType, ParallelConfig,
                          SchedulerConfig, VllmConfig)
+from vllm.platforms import _Backend
+from vllm.utils import resolve_obj_by_qualname
 from vllm.v1.attention.backends.utils import CommonAttentionMetadata
 from vllm.v1.kv_cache_interface import FullAttentionSpec
 
@@ -92,7 +94,7 @@ def create_common_attn_metadata(
     )
 
 
-def get_attention_backend(backend_name: str):
+def get_attention_backend(backend_name: _Backend):
     """Set up attention backend classes for testing.
 
     Args:
@@ -103,23 +105,23 @@ def get_attention_backend(backend_name: str):
         Tuple of (backend_builder_class, backend_impl_class)
     """
     backend_map = {
-        "flash_attn":
-        ("vllm.v1.attention.backends.flash_attn", "FlashAttentionBackend"),
-        "flashinfer":
-        ("vllm.v1.attention.backends.flashinfer", "FlashInferBackend"),
-        "flex_attention":
-        ("vllm.v1.attention.backends.flex_attention", "FlexAttentionBackend"),
+        _Backend.FLASH_ATTN_VLLM_V1:
+        "vllm.v1.attention.backends.flash_attn.FlashAttentionBackend",
+        _Backend.FLASHINFER_VLLM_V1:
+        "vllm.v1.attention.backends.flashinfer.FlashInferBackend",
+        _Backend.FLEX_ATTENTION:
+        "vllm.v1.attention.backends.flex_attention.FlexAttentionBackend",
+        _Backend.TRITON_ATTN_VLLM_V1:
+        "vllm.v1.attention.backends.triton_attn.TritonAttnBackend",
     }
 
     if backend_name not in backend_map:
         raise ValueError(f"Unknown backend: {backend_name}")
 
-    module_name, backend_class_name = backend_map[backend_name]
+    backend_class_name = backend_map[backend_name]
 
     try:
-        import importlib
-        module = importlib.import_module(module_name)
-        backend_class = getattr(module, backend_class_name)
+        backend_class = resolve_obj_by_qualname(backend_class_name)
         return backend_class.get_builder_cls(), backend_class.get_impl_cls()
     except ImportError as e:
         pytest.skip(f"{backend_name} not available: {e}")
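
The backend map now stores one fully qualified class path per backend and resolves it with `vllm.utils.resolve_obj_by_qualname`, instead of carrying (module, class-name) tuples and importing by hand. A rough sketch of what such a resolver typically does, written as a local stand-in rather than vLLM's actual implementation (the helper name `_resolve_by_qualname` is illustrative):

```python
import importlib
from typing import Any


def _resolve_by_qualname(qualname: str) -> Any:
    """Split 'pkg.module.ClassName' at the last dot, import the module part,
    and return the named attribute from it."""
    module_name, _, attr_name = qualname.rpartition(".")
    module = importlib.import_module(module_name)
    return getattr(module, attr_name)


# Mirrors how get_attention_backend() uses the resolved class:
backend_cls = _resolve_by_qualname(
    "vllm.v1.attention.backends.flash_attn.FlashAttentionBackend")
builder_cls = backend_cls.get_builder_cls()
impl_cls = backend_cls.get_impl_cls()
```

Keeping the map values as plain dotted paths makes each entry a single copy-pasteable string and drops the importlib boilerplate from the try block.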

vllm/v1/attention/backends/flash_attn.py

Lines changed: 4 additions & 0 deletions
@@ -208,6 +208,10 @@ def build(self,
               common_prefix_len: int,
               common_attn_metadata: CommonAttentionMetadata,
               fast_build: bool = False) -> FlashAttentionMetadata:
+        """
+        fast_build disables AOT scheduling, used when there will be few
+        iterations i.e. spec-decode
+        """
         num_reqs = common_attn_metadata.num_reqs
         num_actual_tokens = common_attn_metadata.num_actual_tokens
         max_query_len = common_attn_metadata.max_query_len
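
The added docstring documents the `fast_build` flag on the FlashAttention metadata builder: it skips the ahead-of-time (AOT) scheduling setup when the resulting metadata will only be used for a few iterations, as in speculative decoding. A hedged sketch of a call site, assuming a builder instance and a prepared `CommonAttentionMetadata` already exist and relying only on the `build()` signature shown in the diff above (variable names are illustrative):

```python
# `builder` and `common_attn_metadata` are assumed to be set up by the caller.

# Regular prefill/decode step: the default fast_build=False keeps AOT
# scheduling enabled.
attn_metadata = builder.build(
    common_prefix_len=0,
    common_attn_metadata=common_attn_metadata,
)

# Spec-decode style step: the metadata is short-lived, so skip the AOT
# scheduler work with fast_build=True.
draft_metadata = builder.build(
    common_prefix_len=0,
    common_attn_metadata=common_attn_metadata,
    fast_build=True,
)
```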
