
Commit e34fd18

Restrict FP8 attention output to the non-unified backend until the accuracy issue is resolved
1 parent 9b131ae commit e34fd18

File tree

1 file changed (+2, -0 lines)

vllm/model_executor/models/llama.py

Lines changed: 2 additions & 0 deletions
@@ -215,6 +215,8 @@ def __init__(
                    and quant_config.is_fp8_w8a8())
         self.attn_fp8_out = (envs.VLLM_USE_ROCM_CUSTOM_PAGED_ATTN_FP8_OUT
                              and current_platform.is_fp8_fnuz() and use_fp8)
+        if envs.VLLM_USE_V1 and not envs.VLLM_V1_USE_PREFILL_DECODE_ATTENTION:
+            self.attn_fp8_out = False
 
         self.attn = Attention(
             self.num_heads,
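
As a rough illustration (not code from the repository), the gating added by this commit can be read as a standalone predicate. The boolean parameters below stand in for the environment flags and platform/quantization checks referenced in the diff (envs.VLLM_USE_ROCM_CUSTOM_PAGED_ATTN_FP8_OUT, current_platform.is_fp8_fnuz(), quant_config.is_fp8_w8a8(), envs.VLLM_USE_V1, envs.VLLM_V1_USE_PREFILL_DECODE_ATTENTION); the function name and signature are hypothetical.

def resolve_attn_fp8_out(rocm_custom_paged_attn_fp8_out: bool,
                         is_fp8_fnuz: bool,
                         use_fp8: bool,
                         use_v1: bool,
                         v1_use_prefill_decode_attention: bool) -> bool:
    # Pre-existing condition: FP8 attention output requires the ROCm custom
    # paged-attention FP8 output path, an fp8-fnuz platform, and FP8 W8A8
    # quantization.
    attn_fp8_out = (rocm_custom_paged_attn_fp8_out
                    and is_fp8_fnuz
                    and use_fp8)
    # Added by this commit: on the V1 engine, disable FP8 attention output
    # unless the separate prefill/decode (non-unified) attention backend is
    # selected, until the accuracy issue is resolved.
    if use_v1 and not v1_use_prefill_decode_attention:
        attn_fp8_out = False
    return attn_fp8_out

# Example: the V1 engine with the unified backend keeps attn_fp8_out disabled.
assert resolve_attn_fp8_out(True, True, True, use_v1=True,
                            v1_use_prefill_decode_attention=False) is False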
