We read every piece of feedback, and take your input very seriously.
To see all available qualifiers, see our documentation.
There was an error while loading. Please reload this page.
1 parent 662127a · commit 9b131ae — Copy full SHA for 9b131ae
vllm/v1/attention/backends/triton_attn.py
@@ -127,6 +127,7 @@ def __init__(
127
"TritonAttentionImpl")
128
129
self.fp8_dtype = current_platform.fp8_dtype()
130
+ self.use_prefill_decode_attn = envs.VLLM_V1_USE_PREFILL_DECODE_ATTENTION
131
132
def forward(
133
self,
@@ -168,7 +169,7 @@ def forward(
168
169
# performance to make sure it does not introduce any overhead.
170
171
num_queries_per_kv = query.shape[1] // key.shape[1]
- use_prefill_decode_attn = envs.VLLM_V1_USE_PREFILL_DECODE_ATTENTION or (
172
+ use_prefill_decode_attn = self.use_prefill_decode_attn or (
173
(num_queries_per_kv & (num_queries_per_kv - 1)) != 0)
174
175
num_actual_tokens = attn_metadata.num_actual_tokens
0 commit comments