 59 |  59 |     KVCacheSpec)
 60 |  60 | from vllm.v1.outputs import (EMPTY_MODEL_RUNNER_OUTPUT, LogprobsTensors,
 61 |  61 |                              ModelRunnerOutput)
    |  62 | +from vllm.v1.pool.metadata import PoolingMetadata
 62 |  63 | from vllm.v1.sample.metadata import SamplingMetadata
 63 |  64 | from vllm.v1.sample.sampler import Sampler
 64 |  65 | from vllm.v1.spec_decode.metadata import SpecDecodeMetadata

 76 |  77 | from vllm_ascend.attention.mla_v1 import (AscendMLAMetadata,
 77 |  78 |                                            CommonAttentionMetadata)
 78 |  79 | from vllm_ascend.platform import NPUPlatform
 79 |     | -from vllm_ascend.pool.metadata import PoolingMetadata
 80 |  80 | from vllm_ascend.sample.rejection_sampler import AscendRejectionSampler
 81 |  81 | from vllm_ascend.utils import (ACL_FORMAT_FRACTAL_ND, ACL_FORMAT_FRACTAL_NZ,
 82 |  82 |                                ProfileExecuteDuration,

@@ -571,7 +571,10 @@ def get_eagle_atten_dict(
571 | 571 |
572 | 572 |         # OPTIMIZATION: Start copying the block table first.
573 | 573 |         # This way, we can overlap the copy with the following CPU operations.
574 |     | -        self.input_batch.block_table.commit(num_reqs)
    | 574 | +        if vllm_version_is("0.9.2"):
    | 575 | +            self.input_batch.block_table.commit(num_reqs)
    | 576 | +        else:
    | 577 | +            self.input_batch.block_table.commit_block_table(num_reqs)
575 | 578 |
576 | 579 |         # Get the number of scheduled tokens for each request.
577 | 580 |         req_ids = self.input_batch.req_ids

@@ -902,7 +905,10 @@ def _process_reqs(
902 | 905 |
903 | 906 |         # OPTIMIZATION: Start copying the block table first.
904 | 907 |         # This way, we can overlap the copy with the following CPU operations.
905 |     | -        self.input_batch.block_table.commit(num_reqs)
    | 908 | +        if vllm_version_is("0.9.2"):
    | 909 | +            self.input_batch.block_table.commit(num_reqs)
    | 910 | +        else:
    | 911 | +            self.input_batch.block_table.commit_block_table(num_reqs)
906 | 912 |
907 | 913 |         # Get the number of scheduled tokens for each request.
908 | 914 |         # TODO: The Python loop can be slow. Optimize.

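For context, a minimal sketch of the version-gating pattern that both hunks above apply: dispatch between the old `commit` method (vLLM 0.9.2) and the renamed `commit_block_table` method (newer vLLM). This is not part of the PR; it assumes `vllm_version_is` is the version-check helper exported by vllm_ascend.utils, and the wrapper name `commit_block_tables` is hypothetical, shown only to illustrate factoring the check out of each call site.

# Sketch only: hypothetical wrapper around the version-dependent block-table API.
from vllm_ascend.utils import vllm_version_is  # assumed helper, as used in the diff


def commit_block_tables(input_batch, num_reqs: int) -> None:
    """Commit block-table updates via whichever API the installed vLLM exposes."""
    if vllm_version_is("0.9.2"):
        # vLLM 0.9.2 still names the method `commit`.
        input_batch.block_table.commit(num_reqs)
    else:
        # Later vLLM versions rename it to `commit_block_table`.
        input_batch.block_table.commit_block_table(num_reqs)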