@@ -13,8 +13,7 @@
                                               is_quantized_kv_cache)
 from vllm.attention.backends.utils import CommonAttentionState
 from vllm.logger import init_logger
-from vllm.v1.attention.backends.utils import (AttentionMetadataBuilder,
-                                              CommonAttentionMetadata)
+from vllm.v1.attention.backends.utils import CommonAttentionMetadata
 from vllm.v1.core.sched.output import SchedulerOutput
 from vllm.v1.kv_cache_interface import AttentionSpec
 from vllm.v1.worker.block_table import BlockTable
@@ -310,7 +309,7 @@ def get_seq_len_block_table_args(
         raise AttributeError(f"Invalid attention type {str(attn_type)}")
 
 
-class TorchSDPAMetadataBuilderV1(AttentionMetadataBuilder[TorchSDPAMetadata]):
+class TorchSDPAMetadataBuilderV1:
 
     def __init__(self, runner: CPUModelRunner, kv_cache_spec: AttentionSpec,
                  block_table: BlockTable) -> None:
@@ -374,10 +373,8 @@ def reorder_batch(self, input_batch: InputBatch,
 
         return True
 
-    def build(self,
-              common_prefix_len: int,
-              common_attn_metadata: CommonAttentionMetadata,
-              fast_build: bool = False) -> TorchSDPAMetadata:
+    def build(self, common_prefix_len: int,
+              common_attn_metadata: CommonAttentionMetadata):
         num_reqs = common_attn_metadata.num_reqs
         num_actual_tokens = common_attn_metadata.num_actual_tokens
         max_query_len = common_attn_metadata.max_query_len