 from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
                                               AttentionMetadata, AttentionType,
                                               is_quantized_kv_cache)
-from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.logger import init_logger
 from vllm.platforms import current_platform
 from vllm.v1.attention.backends.utils import (AttentionMetadataBuilder,
@@ -237,17 +236,13 @@ def final_mask_mod(
 
     def build_block_mask(self) -> BlockMask:
         assert self.mask_mod is not None
-        # FIXME: With TP>1, create_block_mask_compiled will raise
-        # CUDA error: an illegal memory access was encountered
-        create_block_mask_fn = (create_block_mask_compiled
-                                if get_tensor_model_parallel_world_size() == 1
-                                else create_block_mask)
-        return create_block_mask_fn(
+        return create_block_mask_compiled(
             self.mask_mod,
             None,
             None,
             self.num_actual_tokens,
             self.total_cache_tokens,
+            device=self.block_table.device,
         )
 
     def __post_init__(self):
@@ -429,7 +424,6 @@ def forward(
         shape = [num_tokens, num_heads * head_size]
         """
         assert output is not None, "Output tensor must be provided."
-
         if output_scale is not None:
             raise NotImplementedError(
                 "fused output quantization is not yet supported"
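
A minimal sketch of the call pattern the new build_block_mask relies on: PyTorch's flex_attention create_block_mask accepts an explicit device argument, and wrapping it in torch.compile gives a drop-in compiled variant. The mask_mod, sequence lengths, and torch.compile flags below are illustrative assumptions, not vLLM's exact definitions.

# Illustrative sketch (not vLLM code) of a compiled create_block_mask call
# with an explicit device, mirroring the change in the diff above.
import torch
from torch.nn.attention.flex_attention import create_block_mask

# The torch.compile options here are an assumption, not vLLM's exact flags.
create_block_mask_compiled = torch.compile(create_block_mask, fullgraph=True)


def causal_mask_mod(b, h, q_idx, kv_idx):
    # Example mask_mod: plain causal masking over query/key indices.
    return q_idx >= kv_idx


num_actual_tokens = 128   # Q_LEN, illustrative
total_cache_tokens = 256  # KV_LEN, illustrative
device = "cuda" if torch.cuda.is_available() else "cpu"

block_mask = create_block_mask_compiled(
    causal_mask_mod,
    None,  # B: broadcast over batch
    None,  # H: broadcast over heads
    num_actual_tokens,
    total_cache_tokens,
    device=device,  # analogous to device=self.block_table.device in the diff
)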