We read every piece of feedback, and take your input very seriously.
To see all available qualifiers, see our documentation.
There was an error while loading. Please reload this page.
1 parent c57c941 · commit 770e5dc · Copy full SHA for 770e5dc
vllm/v1/worker/gpu_model_runner.py
@@ -655,7 +655,10 @@ def _prepare_inputs(
655
656
# Fill unused with -1. Needed for reshape_and_cache
657
self.seq_lens[num_reqs:].fill_(0)
658
- self.query_start_loc[num_reqs + 1:].fill_(-1)
+ # Note: pad query_start_loc to be non-decreasing, as kernels
659
+ # like FlashAttention require that
660
+ self.query_start_loc[num_reqs + 1:].fill_(
661
+ self.query_start_loc_cpu[num_reqs].item())
662
663
query_start_loc = self.query_start_loc[:num_reqs + 1]
664
seq_lens = self.seq_lens[:num_reqs]
0 commit comments