Skip to content

Commit 9ad0a45

Browse files
authored
[Bugfix] Switch bailout logic for kv-cache-dtype with SM100 Flashinfer (#20934)
Signed-off-by: Pavani Majety <pmajety@nvidia.com>
1 parent 016b8d1 commit 9ad0a45

File tree

1 file changed

+4
-3
lines changed

1 file changed

+4
-3
lines changed

vllm/engine/arg_utils.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1418,14 +1418,15 @@ def _is_v1_supported_oracle(self, model_config: ModelConfig) -> bool:
1418  1418                   and not envs.is_set("VLLM_ATTENTION_BACKEND")
1419  1419               ) or envs.VLLM_ATTENTION_BACKEND == "FLASH_ATTN_VLLM_V1"
1420  1420               supported = False
1421        -           if current_platform.is_rocm():
      1421   +           if current_platform.is_rocm() or (
      1422   +                   current_platform.is_cuda()
      1423   +                   and current_platform.is_device_capability(100)):
1422  1424                   supported = True
1423  1425               elif fp8_attention and will_use_fa:
1424  1426                   from vllm.attention.utils.fa_utils import (
1425  1427                       flash_attn_supports_fp8)
1426  1428                   supported = flash_attn_supports_fp8()
1427        -           elif envs.VLLM_USE_TRTLLM_DECODE_ATTENTION:
1428        -               supported = True
      1429   +
1429  1430               if not supported:
1430  1431                   _raise_or_fallback(feature_name="--kv-cache-dtype",
1431  1432                                      recommend_to_remove=False)

0 commit comments

Comments (0)