
Commit 49dd290

bigPYJ1151 authored and Chen-zexi committed
[Bugfix] Lazy import fused_experts in BitsAndBytesMoEMethod to avoid breaking non-CUDA-alike devices (vllm-project#20822)
Signed-off-by: jiang1.li <jiang1.li@intel.com>
1 parent 29f8881 commit 49dd290

File tree: 1 file changed (+1, -1 lines)


vllm/model_executor/layers/quantization/bitsandbytes.py

Lines changed: 1 addition & 1 deletion
@@ -5,7 +5,6 @@
 
 import torch
 
-from vllm.model_executor.layers.fused_moe import fused_experts
 from vllm.model_executor.layers.fused_moe.layer import (FusedMoE,
                                                         FusedMoEMethodBase)
 from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase,
@@ -467,6 +466,7 @@ def apply(
         logical_to_physical_map: Optional[torch.Tensor] = None,
         logical_replica_count: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
+        from vllm.model_executor.layers.fused_moe import fused_experts
 
         if enable_eplb:
             raise NotImplementedError(
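
The change is the standard deferred-import pattern: moving the fused_experts import from module scope into BitsAndBytesMoEMethod.apply means that merely importing bitsandbytes.py no longer pulls in the fused MoE kernels, which can fail to load on non-CUDA-alike devices; the dependency is resolved only when the MoE path actually runs. Below is a minimal, self-contained sketch of the pattern; LazyImportMoEMethod and the math stand-in are illustrative only and not part of vLLM.

```python
# A minimal sketch of the deferred-import pattern applied in this commit.
# Names here (LazyImportMoEMethod, the `math` stand-in) are illustrative only;
# in vLLM the deferred dependency is
#     from vllm.model_executor.layers.fused_moe import fused_experts
# which may fail at module load time on non-CUDA-alike devices.

class LazyImportMoEMethod:
    """The enclosing module stays importable even if the heavy backend is absent."""

    def apply(self, x: float) -> float:
        # Deferred import: the dependency is resolved only when apply() runs,
        # not when this module is imported. After the first call the imported
        # module is cached in sys.modules, so later calls pay only a dict lookup.
        from math import sqrt  # stand-in for the device-specific kernel module
        return sqrt(x)


if __name__ == "__main__":
    print(LazyImportMoEMethod().apply(9.0))  # 3.0
```

The trade-off is a small per-call import lookup (cached in sys.modules after the first call) in exchange for keeping the quantization module importable on every platform.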

0 commit comments