
Commit 6b4e406

fix merge
Signed-off-by: Bill Nell <bnell@redhat.com>
1 parent 4fdeb70 commit 6b4e406

File tree

3 files changed: 4 additions, 4 deletions

vllm/model_executor/layers/fused_moe/__init__.py

2 additions, 1 deletion

@@ -4,6 +4,7 @@
 from contextlib import contextmanager
 from typing import Any, Optional
 
+from vllm.model_executor.layers.fused_moe.config import FusedMoEConfig
 from vllm.model_executor.layers.fused_moe.layer import (
     FusedMoE, FusedMoEMethodBase, FusedMoeWeightScaleSupported)
 from vllm.model_executor.layers.fused_moe.modular_kernel import (
@@ -29,12 +30,12 @@ def get_config() -> Optional[dict[str, Any]]:
 
 __all__ = [
     "FusedMoE",
+    "FusedMoEConfig",
     "FusedMoEMethodBase",
     "FusedMoeWeightScaleSupported",
     "FusedMoEPermuteExpertsUnpermute",
     "FusedMoEActivationFormat",
     "FusedMoEPrepareAndFinalize",
-    "MoEConfig",
     "override_config",
     "get_config",
 ]
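For context, this hunk makes FusedMoEConfig part of the package's public surface (it is added to __all__ above) while the old MoEConfig name is dropped. A minimal sketch of what a downstream import would look like once this commit is applied; assumes vllm is installed at this revision:

# Sketch only: import the re-exported config class from the package root.
from vllm.model_executor.layers.fused_moe import FusedMoE, FusedMoEConfig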

vllm/model_executor/layers/fused_moe/batched_triton_or_deep_gemm_moe.py

2 additions, 1 deletion

@@ -40,6 +40,7 @@ def __init__(self,
         self.max_num_tokens = max_num_tokens
         self.world_size = world_size
         self.dp_size = dp_size
+        self.allow_deep_gemm = allow_deep_gemm
 
         # BatchedTritonKernel doesn't support block quantization
         # at the moment.
@@ -56,7 +57,7 @@ def __init__(self,
         ) if self.block_shape is None else None
 
         is_fp8_128_block_quantized = (
-            self.use_fp8_w8a8 and self.block_shape
+            use_fp8_w8a8 and self.block_shape
             == BatchedDeepGemmExperts.DEEPGEMM_BLOCK_SHAPE)
 
         self.batched_deep_gemm_experts = BatchedDeepGemmExperts(
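The second hunk reads the local constructor argument use_fp8_w8a8 instead of self.use_fp8_w8a8; the diff alone does not show whether that attribute exists at this point in __init__, so presumably it was either not yet assigned or not the intended source. A standalone, hypothetical illustration of the pattern (the class name and the DEEPGEMM_BLOCK_SHAPE value of [128, 128] are assumptions for illustration, not taken from this diff):

# Hypothetical, simplified constructor: read the local argument rather than
# an instance attribute that may not have been assigned yet in __init__.
DEEPGEMM_BLOCK_SHAPE = [128, 128]  # assumed value, for illustration only

class ExampleExperts:
    def __init__(self, use_fp8_w8a8: bool, block_shape=None):
        self.block_shape = block_shape
        # Referencing self.use_fp8_w8a8 here would fail (or be stale) if the
        # attribute is only set later, so the argument is used directly.
        self.is_fp8_128_block_quantized = (
            use_fp8_w8a8 and self.block_shape == DEEPGEMM_BLOCK_SHAPE)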

vllm/model_executor/layers/fused_moe/deep_gemm_moe.py

0 additions, 2 deletions

@@ -15,8 +15,6 @@
     MoEPrepareAndFinalizeNoEP)
 from vllm.model_executor.layers.fused_moe.utils import (
     _resize_cache, per_token_group_quant_fp8)
-from vllm.model_executor.layers.quantization.deepgemm import ( # noqa: E501
-    m_grouped_gemm_fp8_fp8_bf16_nt_contiguous_deepgemm as m_grouped_gemm_fp8_fp8_bf16_nt_contiguous_deepgemm)
 from vllm.utils import round_up
 
 logger = init_logger(__name__)
