@@ -14,9 +14,9 @@
 from vllm import _custom_ops as ops
 from vllm.logger import init_logger
 from vllm.model_executor.layers.fused_moe import (
-    CutlassExpertsFp8, FusedMoE, FusedMoEActivationFormat, FusedMoEConfig,
-    FusedMoEMethodBase, FusedMoEPermuteExpertsUnpermute,
-    FusedMoEPrepareAndFinalize, FusedMoeWeightScaleSupported, fused_experts)
+    FusedMoE, FusedMoEActivationFormat, FusedMoEConfig, FusedMoEMethodBase,
+    FusedMoEPermuteExpertsUnpermute, FusedMoEPrepareAndFinalize,
+    FusedMoeWeightScaleSupported)
 from vllm.model_executor.layers.quantization.compressed_tensors.schemes.compressed_tensors_wNa16 import (  # noqa
     WNA16_SUPPORTED_BITS, WNA16_SUPPORTED_TYPES_MAP)
 from vllm.model_executor.layers.quantization.utils import replace_parameter
@@ -570,6 +570,7 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
             del layer.w2_input_scale
             self.fused_experts_func = None
         else:
+            from vllm.model_executor.layers.fused_moe import fused_experts
             self.fused_experts_func = fused_experts

     def apply(
@@ -826,6 +827,7 @@ def select_gemm_impl(
         prepare_finalize: FusedMoEPrepareAndFinalize,
         moe: FusedMoEConfig,
     ) -> FusedMoEPermuteExpertsUnpermute:
+        from vllm.model_executor.layers.fused_moe import CutlassExpertsFp8

         use_batched_format = (prepare_finalize.activation_format ==
                               FusedMoEActivationFormat.BatchedExperts)
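Taken together, the hunks move `fused_experts` and `CutlassExpertsFp8` out of the module-level import list and re-import them locally at their call sites. The diff itself does not state the motivation, but the usual reason for this pattern is to break a circular import (or defer a heavy one) involving `vllm.model_executor.layers.fused_moe`. Below is a minimal, self-contained sketch of why the function-local form survives a cycle that would break the module-level form; the module names `a.py`/`b.py` and the symbol `VALUE` are illustrative only, not vLLM's layout:

```python
"""Sketch: why a function-local import survives an import cycle.

Creates two throwaway modules: b.py imports a.py at module level,
while a.py imports b.py only inside a function -- the same shape
this diff gives the fused_experts / CutlassExpertsFp8 imports.
"""
import sys
import tempfile
import textwrap
from pathlib import Path

tmp = Path(tempfile.mkdtemp())
sys.path.insert(0, str(tmp))

(tmp / "a.py").write_text(textwrap.dedent("""
    def get_value():
        # Deferred import: resolved on the first call, after both
        # modules have finished loading, so the a <-> b cycle is
        # harmless. A module-level `from b import VALUE` here would
        # raise ImportError instead, because b is only partially
        # initialized while it is still executing its `import a`.
        from b import VALUE
        return VALUE
"""))
(tmp / "b.py").write_text(textwrap.dedent("""
    import a  # module-level import back into a: forms a cycle

    VALUE = 42
"""))

import b  # loads b, which loads a; a has no top-level import of b

print(b.a.get_value())  # -> 42
```

Run directly, this prints `42`; with the import in `a.py` hoisted to module level, it would fail at `import b` with an ImportError about a partially initialized module.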