File tree: 1 file changed (+7 −2 lines)
vllm/model_executor/models: 1 file changed (+7 −2 lines)
@@ -43,6 +43,7 @@
                                         PromptReplacement, PromptUpdate,
                                         PromptUpdateDetails)
 from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
+from vllm.platforms import current_platform
 from vllm.sequence import IntermediateTensors
 from vllm.transformers_utils.tokenizer import (MistralTokenizer,
                                                cached_tokenizer_from_config)
@@ -54,7 +55,12 @@

 try:
     from xformers import ops as xops
-    USE_XFORMERS_OPS = True
+    if (current_platform.is_cuda()
+            and current_platform.has_device_capability(100)):
+        # Xformers FA is not compatible with B200
+        USE_XFORMERS_OPS = False
+    else:
+        USE_XFORMERS_OPS = True
 except ImportError:
     USE_XFORMERS_OPS = False
@@ -1082,7 +1088,6 @@ def forward(
         # Transpose q and k back for attention
         q = q.transpose(1, 2).contiguous()
         k = k.transpose(1, 2).contiguous()
-
         out = xops.memory_efficient_attention(q,
                                               k,
                                               v,
You can’t perform that action at this time.
0 commit comments