
Commit 91a5600

Fused FP8 attention output is only possible when both the flash and paged attention paths support it, so disable it when Triton flash attention is not in use (#545)
1 parent a31e5d8 commit 91a5600

1 file changed: 1 addition, 0 deletions

vllm/model_executor/models/llama.py

Lines changed: 1 addition & 0 deletions
@@ -214,6 +214,7 @@ def __init__(
             quant_config, Fp8Config) or (isinstance(quant_config, QuarkConfig)
                                          and quant_config.is_fp8_w8a8())
         self.attn_fp8_out = (envs.VLLM_USE_ROCM_CUSTOM_PAGED_ATTN_FP8_OUT
+                             and envs.VLLM_USE_TRITON_FLASH_ATTN
                              and current_platform.is_fp8_fnuz() and use_fp8)
         if envs.VLLM_USE_V1 and not envs.VLLM_V1_USE_PREFILL_DECODE_ATTENTION:
             self.attn_fp8_out = False
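
For reference, a minimal standalone sketch of the gating logic after this change. The flag and helper names in the comments are taken from the diff above; the function itself is illustrative only and is not part of vLLM, with the environment flags and platform checks replaced by plain booleans.

def resolve_attn_fp8_out(
    use_rocm_custom_paged_attn_fp8_out: bool,  # VLLM_USE_ROCM_CUSTOM_PAGED_ATTN_FP8_OUT
    use_triton_flash_attn: bool,               # VLLM_USE_TRITON_FLASH_ATTN (new condition in this commit)
    is_fp8_fnuz: bool,                         # current_platform.is_fp8_fnuz()
    use_fp8: bool,                             # FP8 quantization is active for this layer
    use_v1: bool,                              # VLLM_USE_V1
    v1_use_prefill_decode_attention: bool,     # VLLM_V1_USE_PREFILL_DECODE_ATTENTION
) -> bool:
    """Return True only when both the paged and flash attention paths can fuse FP8 output."""
    attn_fp8_out = (use_rocm_custom_paged_attn_fp8_out
                    and use_triton_flash_attn
                    and is_fp8_fnuz and use_fp8)
    if use_v1 and not v1_use_prefill_decode_attention:
        attn_fp8_out = False
    return attn_fp8_out


# Example: with Triton flash attention disabled, fused FP8 output is turned off
# even though the paged-attention flag and platform checks would allow it.
assert resolve_attn_fp8_out(True, False, True, True, False, True) is False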
