Commit 7219559

fixes
Signed-off-by: Bill Nell <bnell@redhat.com>
1 parent: 946788f

File tree: 2 files changed (+15, -5 lines)


vllm/model_executor/layers/fused_moe/config.py

Lines changed: 8 additions & 2 deletions
@@ -90,9 +90,14 @@ def is_grouped(self) -> bool:
     def is_per_tensor(self) -> bool:
         return not self.per_act_token_quant and self.block_shape is None
 
-    def scale_shape(self, max_tokens: int, hidden_dim: int) -> Optional[tuple[int, int]]:
+    def scale_shape(
+        self,
+        max_tokens: int,
+        hidden_dim: int,
+    ) -> Optional[tuple[int, int]]:
         if self.is_quantized:
             if self.is_grouped:
+                assert self.block_shape is not None
                 _, block_k = self.block_shape
                 k_tiles = cdiv(hidden_dim, block_k)
                 return (max_tokens, k_tiles)
@@ -107,10 +112,11 @@ def batched_scale_shape(
         self,
         num_experts: int,
         max_tokens: int,
-        hidden_dim: int
+        hidden_dim: int,
     ) -> Optional[tuple[int, int, int]]:
         if self.is_quantized:
             scale_shape = self.scale_shape(max_tokens, hidden_dim)
+            assert scale_shape is not None
             return (num_experts, *scale_shape)
         else:
             return None
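
Beyond the signature reflow, the substantive change here is the two new asserts, which make the Optional handling explicit: a grouped quantization config implies a block shape, and a quantized config implies a scale shape. The asserts act both as runtime sanity checks and as type narrowing for the checker, so the tuple unpacks below them are provably safe. A minimal standalone sketch of the same pattern, using illustrative names rather than the real vLLM classes:

    from typing import Optional

    def cdiv(a: int, b: int) -> int:
        # Ceiling division, matching the role of the cdiv helper used above.
        return -(a // -b)

    class ScaleConfig:
        # Hypothetical stand-in for the quantization config in config.py.
        def __init__(self, block_shape: Optional[tuple[int, int]] = None) -> None:
            self.block_shape = block_shape

        def scale_shape(
            self,
            max_tokens: int,
            hidden_dim: int,
        ) -> Optional[tuple[int, int]]:
            if self.block_shape is not None:
                # After the None check (or an assert), the type checker knows
                # block_shape is a concrete tuple, so the unpack is safe.
                _, block_k = self.block_shape
                return (max_tokens, cdiv(hidden_dim, block_k))
            return None

        def batched_scale_shape(
            self,
            num_experts: int,
            max_tokens: int,
            hidden_dim: int,
        ) -> Optional[tuple[int, int, int]]:
            scale_shape = self.scale_shape(max_tokens, hidden_dim)
            if scale_shape is None:
                return None
            # Same narrowing idea as the new assert in the diff: the splat
            # below is only reached once scale_shape is known non-None.
            return (num_experts, *scale_shape)

    cfg = ScaleConfig(block_shape=(128, 128))
    print(cfg.scale_shape(64, 4096))             # (64, 32)
    print(cfg.batched_scale_shape(8, 64, 4096))  # (8, 64, 32)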

vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py

Lines changed: 7 additions & 3 deletions
@@ -308,13 +308,17 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
         else:
             self.fused_experts_func = fused_experts
 
-    def select_gemm_impl(self, prepare_finalize):
+    def select_gemm_impl(
+        self,
+        prepare_finalize: FusedMoEPrepareAndFinalize,
+        moe: FusedMoEConfig,
+    ) -> FusedMoEPermuteExpertsUnpermute:
         from vllm.model_executor.layers.fused_moe.fused_batched_moe import (
             BatchedTritonExperts)
 
         assert not self.rocm_aiter_moe_enabled and not self.use_marlin
 
-        logger.debug("BatchedTritonExperts(%s)", self.__classname__.__name__)
+        logger.debug("BatchedTritonExperts(%s)", self.__class__.__name__)
 
         use_batched_format = (prepare_finalize.activation_format ==
                               FusedMoEActivationFormat.BatchedExperts)
@@ -595,7 +599,7 @@ def select_gemm_impl(
         num_experts = (moe.num_local_experts
                        if use_batched_format else moe.num_experts)
 
-        logger.debug("CutlassExpertsFp8(%s)", self.__classname__.__name__)
+        logger.debug("CutlassExpertsFp8(%s)", self.__class__.__name__)
 
         experts = CutlassExpertsFp8(
             num_experts,
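
The two logging fixes matter more than they look: Python objects have no __classname__ attribute, so self.__classname__.__name__ raises AttributeError. Since logger.debug's arguments are evaluated before the call regardless of the configured log level, the old code would crash whenever this path ran, even with debug logging disabled. A quick illustration with a throwaway class:

    class Example:
        pass

    obj = Example()
    print(obj.__class__.__name__)   # "Example" -- the standard way to get a class name

    try:
        print(obj.__classname__)    # no such attribute on any Python object
    except AttributeError as e:
        print(e)                    # 'Example' object has no attribute '__classname__'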
