1 parent 7f0d422 commit 557ee9e
vllm/v1/attention/backends/flex_attention.py
@@ -301,9 +301,8 @@ def build(self,
             block_table_tensor, self.cache_config.num_gpu_blocks)
 
         # Get the original offset tensor
-        offset_tensor = torch.tensor(
-            common_attn_metadata.num_computed_tokens_cpu[:num_reqs]).to(
-                self.device, non_blocking=True)
+        offset_tensor = common_attn_metadata.num_computed_tokens_cpu.to(
+            self.device, non_blocking=True)
 
         out = FlexAttentionMetadata(
             num_actual_tokens=num_actual_tokens,
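
For context, the commit replaces re-wrapping a slice of the CPU-side tensor in torch.tensor() with a direct asynchronous copy of the existing tensor to the device (note it also drops the [:num_reqs] slice, so the whole tensor is transferred). Below is a minimal standalone sketch of that difference, assuming num_computed_tokens_cpu is already a CPU torch.Tensor (the stand-in names, dtype, and sizes are illustrative, not taken from the vLLM code):

import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Stand-in for common_attn_metadata.num_computed_tokens_cpu; assumed to already
# be a CPU torch.Tensor, since the new code calls .to() on it directly.
num_computed_tokens_cpu = torch.zeros(8, dtype=torch.int32)
num_reqs = 8

# Old pattern: torch.tensor(...) makes an extra host-side copy of the sliced
# data before the device transfer (and re-wrapping an existing tensor this way
# is discouraged by PyTorch).
offset_old = torch.tensor(num_computed_tokens_cpu[:num_reqs]).to(
    device, non_blocking=True)

# New pattern: move the existing CPU tensor to the device in one step; with
# pinned host memory, non_blocking=True lets the copy overlap with compute.
offset_new = num_computed_tokens_cpu.to(device, non_blocking=True)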