@@ -40,9 +40,8 @@
 from vllm.sampling_params import SamplingType
 from vllm.sequence import IntermediateTensors
 from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, DeviceMemoryProfiler,
-                        GiB_bytes, LazyLoader, async_tensor_h2d, cdiv,
-                        check_use_alibi, get_dtype_size,
-                        is_pin_memory_available)
+                        GiB_bytes, LazyLoader, cdiv, check_use_alibi,
+                        get_dtype_size, is_pin_memory_available)
 from vllm.v1.attention.backends.mamba_attn import Mamba2AttentionBackend
 from vllm.v1.attention.backends.utils import (AttentionMetadataBuilder,
                                               CommonAttentionMetadata)
@@ -1694,7 +1693,7 @@ def execute_model(
             finished_recving=finished_recving,
             num_nans_in_logits=num_nans_in_logits,
         )
-
+
     def get_valid_sampled_token_ids(
             self, max_gen_len: int, sampled_token_ids: torch.Tensor,
            discard_sampled_tokens_req_indices: np.ndarray) -> list[list[int]]:
@@ -1715,7 +1714,6 @@ def get_valid_sampled_token_ids(
 
         return valid_sampled_token_ids
 
-
     def kv_connector_no_forward(
             self, scheduler_output: "SchedulerOutput") -> ModelRunnerOutput:
         # KV send/recv even if no work to do.
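The diff shows only the signature of the new get_valid_sampled_token_ids helper, not its body. Below is a minimal sketch of what a helper with this shape could do, written as a free function for self-containment (the real method takes self). It assumes sampled_token_ids has shape [num_reqs, max_gen_len] and that invalid speculative positions are marked with a negative placeholder id; both are assumptions, not details taken from this commit.

import numpy as np
import torch


def get_valid_sampled_token_ids(
        max_gen_len: int, sampled_token_ids: torch.Tensor,
        discard_sampled_tokens_req_indices: np.ndarray) -> list[list[int]]:
    if max_gen_len == 1:
        # Common case: exactly one sampled token per request,
        # no speculative-decoding tokens to validate.
        valid_sampled_token_ids = sampled_token_ids.tolist()
    else:
        # Hypothetical handling of speculative decoding: drop placeholder
        # entries (assumed here to be negative) left by rejected drafts.
        valid_sampled_token_ids = [
            [t for t in row if t >= 0]
            for row in sampled_token_ids.tolist()
        ]
    # Requests whose outputs should be discarded (e.g. still-prefilling
    # requests) get an empty token list rather than being dropped, so
    # batch indices stay aligned.
    for i in discard_sampled_tokens_req_indices:
        valid_sampled_token_ids[i].clear()
    return valid_sampled_token_ids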