
Commit 91a5600

Fused FP8 attention output is only possible when both the flash and paged attention paths support it, so disable it when Triton flash attention is not in use (#545)
1 parent a31e5d8 commit 91a5600

1 file changed: 1 addition, 0 deletions

vllm/model_executor/models/llama.py

Lines changed: 1 addition & 0 deletions
@@ -214,6 +214,7 @@ def __init__(
             quant_config, Fp8Config) or (isinstance(quant_config, QuarkConfig)
                                          and quant_config.is_fp8_w8a8())
         self.attn_fp8_out = (envs.VLLM_USE_ROCM_CUSTOM_PAGED_ATTN_FP8_OUT
+                             and envs.VLLM_USE_TRITON_FLASH_ATTN
                              and current_platform.is_fp8_fnuz() and use_fp8)
         if envs.VLLM_USE_V1 and not envs.VLLM_V1_USE_PREFILL_DECODE_ATTENTION:
             self.attn_fp8_out = False
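
For reference, a minimal standalone sketch of the gating logic after this change. The flag and helper names in the comments are taken from the diff above; the function itself is illustrative only and is not part of vLLM, with the environment flags and platform checks replaced by plain booleans.

def resolve_attn_fp8_out(
    use_rocm_custom_paged_attn_fp8_out: bool,  # VLLM_USE_ROCM_CUSTOM_PAGED_ATTN_FP8_OUT
    use_triton_flash_attn: bool,               # VLLM_USE_TRITON_FLASH_ATTN (new condition in this commit)
    is_fp8_fnuz: bool,                         # current_platform.is_fp8_fnuz()
    use_fp8: bool,                             # FP8 quantization is active for this layer
    use_v1: bool,                              # VLLM_USE_V1
    v1_use_prefill_decode_attention: bool,     # VLLM_V1_USE_PREFILL_DECODE_ATTENTION
) -> bool:
    """Return True only when both the paged and flash attention paths can fuse FP8 output."""
    attn_fp8_out = (use_rocm_custom_paged_attn_fp8_out
                    and use_triton_flash_attn
                    and is_fp8_fnuz and use_fp8)
    if use_v1 and not v1_use_prefill_decode_attention:
        attn_fp8_out = False
    return attn_fp8_out


# Example: with Triton flash attention disabled, fused FP8 output is turned off
# even though the paged-attention flag and platform checks would allow it.
assert resolve_attn_fp8_out(True, False, True, True, False, True) is False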
