4 files changed, +15 -6 lines changed
@@ -269,8 +269,8 @@ def chunked_prefill_paged_decode(
     # Conversion of FP8 Tensor from uint8 storage to
     # appropriate torch.dtype for interpretation by Triton
     if "fp8" in kv_cache_dtype:
-        assert key_cache.dtype == torch.uint8
-        assert value_cache.dtype == torch.uint8
+        assert key_cache.dtype in [torch.uint8, current_platform.fp8_dtype()]
+        assert value_cache.dtype in [torch.uint8, current_platform.fp8_dtype()]
 
     if kv_cache_dtype in ("fp8", "fp8_e4m3"):
         target_dtype = current_platform.fp8_dtype()
@@ -749,8 +749,8 @@ def context_attention_fwd(q,
     # Conversion of FP8 Tensor from uint8 storage to
     # appropriate torch.dtype for interpretation by Triton
     if "fp8" in kv_cache_dtype:
-        assert (k_cache.dtype == torch.uint8)
-        assert (v_cache.dtype == torch.uint8)
+        assert k_cache.dtype in [torch.uint8, current_platform.fp8_dtype()]
+        assert v_cache.dtype in [torch.uint8, current_platform.fp8_dtype()]
 
     if kv_cache_dtype in ("fp8", "fp8_e4m3"):
         target_dtype = current_platform.fp8_dtype()
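Both hunks above relax the same precondition: with an fp8 KV-cache dtype, the cache tensors may now arrive either as raw uint8 storage or already carrying the platform's native fp8 dtype. A minimal standalone sketch of the relaxed check follows; it is illustrative only, and torch.float8_e4m3fn is assumed here as a stand-in for whatever current_platform.fp8_dtype() returns on a given platform.

import torch

# Illustrative helper (not from the diff) mirroring the relaxed assertion.
# torch.float8_e4m3fn stands in for current_platform.fp8_dtype().
def check_kv_cache_dtype(key_cache: torch.Tensor,
                         value_cache: torch.Tensor,
                         kv_cache_dtype: str) -> None:
    if "fp8" in kv_cache_dtype:
        allowed = (torch.uint8, torch.float8_e4m3fn)
        assert key_cache.dtype in allowed
        assert value_cache.dtype in allowed

# A cache that is already viewed as the native fp8 dtype now passes,
# where previously only uint8 storage was accepted.
key_cache = torch.zeros(4, 16, dtype=torch.uint8).view(torch.float8_e4m3fn)
value_cache = torch.zeros(4, 16, dtype=torch.uint8).view(torch.float8_e4m3fn)
check_kv_cache_dtype(key_cache, value_cache, "fp8")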
@@ -20,6 +20,7 @@
     VLLM_USE_TRITON_FLASH_ATTN: bool = True
     VLLM_USE_ROCM_CUSTOM_PAGED_ATTN_FP8_OUT: bool = True
     VLLM_USE_ROCM_FP8_FLASH_ATTN: bool = False
+    VLLM_V1_USE_PREFILL_DECODE_ATTENTION: bool = False
     VLLM_FLASH_ATTN_VERSION: Optional[int] = None
     LOCAL_RANK: int = 0
     CUDA_VISIBLE_DEVICES: Optional[str] = None
@@ -331,6 +332,13 @@ def get_vllm_port() -> Optional[int]:
     lambda: (os.getenv("VLLM_USE_ROCM_FP8_FLASH_ATTN", "False").lower() in
              ("true", "1")),
 
+    # Use separate prefill and decode kernels for V1 attention instead of
+    # the unified triton kernel.
+    "VLLM_V1_USE_PREFILL_DECODE_ATTENTION":
+    lambda:
+    (os.getenv("VLLM_V1_USE_PREFILL_DECODE_ATTENTION", "False").lower() in
+     ("true", "1")),
+
     # Internal flag to enable/disable Inductor standalone compile
     "VLLM_TEST_STANDALONE_COMPILE":
     lambda: os.environ.get("VLLM_TEST_STANDALONE_COMPILE", "0") != "0",
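The new entry follows the same string-to-boolean convention as the neighbouring flags: only "true" or "1" (case-insensitive) turns it on, and it defaults to off. A small sketch of that parsing is below, using a hypothetical helper name; in practice the flag would simply be set in the environment, e.g. VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1, before starting vLLM.

import os

# Hypothetical helper reproducing the lambda's parsing convention:
# unset, "False", "0", or any other unrecognised value means disabled.
def parse_bool_env(name: str, default: str = "False") -> bool:
    return os.getenv(name, default).lower() in ("true", "1")

os.environ["VLLM_V1_USE_PREFILL_DECODE_ATTENTION"] = "1"
assert parse_bool_env("VLLM_V1_USE_PREFILL_DECODE_ATTENTION") is True

os.environ["VLLM_V1_USE_PREFILL_DECODE_ATTENTION"] = "no"
assert parse_bool_env("VLLM_V1_USE_PREFILL_DECODE_ATTENTION") is False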
@@ -5,6 +5,7 @@
 import torch
 
 from vllm import _custom_ops as ops
+from vllm import envs
 from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
                                               AttentionMetadata, AttentionType)
 from vllm.attention.ops.chunked_prefill_paged_decode import (
@@ -167,8 +168,8 @@ def forward(
         # performance to make sure it does not introduce any overhead.
 
         num_queries_per_kv = query.shape[1] // key.shape[1]
-        use_prefill_decode_attn = (num_queries_per_kv &
-                                   (num_queries_per_kv - 1)) != 0
+        use_prefill_decode_attn = envs.VLLM_V1_USE_PREFILL_DECODE_ATTENTION or (
+            (num_queries_per_kv & (num_queries_per_kv - 1)) != 0)
 
         num_actual_tokens = attn_metadata.num_actual_tokens
 
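The backend previously took the split prefill/decode path only when num_queries_per_kv was not a power of two (detected by the (n & (n - 1)) != 0 bit trick); the change lets the environment flag force that path as well. A standalone sketch of the selection logic, with an illustrative function name not taken from the diff:

# Illustrative reimplementation of the selection logic above; the function
# name and standalone form are assumptions for the example.
def wants_prefill_decode_attn(num_queries_per_kv: int, force: bool) -> bool:
    not_power_of_two = (num_queries_per_kv & (num_queries_per_kv - 1)) != 0
    return force or not_power_of_two

# 8 queries per KV head is a power of two: unified kernel unless forced.
assert wants_prefill_decode_attn(8, force=False) is False
assert wants_prefill_decode_attn(8, force=True) is True
# 6 is not a power of two: split prefill/decode kernels regardless of the flag.
assert wants_prefill_decode_attn(6, force=False) is True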