Commit efc014f

lint
Signed-off-by: Bill Nell <bnell@redhat.com>
1 parent a548905 commit efc014f

5 files changed: +44 -25 lines changed

vllm/model_executor/layers/fused_moe/config.py

Lines changed: 22 additions & 9 deletions

@@ -242,20 +242,32 @@ class FusedMoEConfig:
     max_num_tokens: int = MOE_DP_CHUNK_SIZE
 
     @property
-    def quant_dtype(self):
-        return self.quant_config.quant_dtype if self.quant_config is not None else None
+    def quant_dtype(self) -> Optional[torch.dtype]:
+        if self.quant_config is not None:
+            return self.quant_config.quant_dtype
+        else:
+            return None
 
     @property
-    def block_shape(self):
-        return self.quant_config.block_shape if self.quant_config is not None else None
+    def block_shape(self) -> Optional[list[int]]:
+        if self.quant_config is not None:
+            return self.quant_config.block_shape
+        else:
+            return None
 
     @property
-    def per_act_token_quant(self):
-        return self.quant_config.per_act_token_quant if self.quant_config is not None else False
+    def per_act_token_quant(self) -> bool:
+        if self.quant_config is not None:
+            return self.quant_config.per_act_token_quant
+        else:
+            return False
 
     @property
-    def per_out_ch_quant(self):
-        return self.quant_config.per_out_ch_quant if self.quant_config is not None else False
+    def per_out_ch_quant(self) -> bool:
+        if self.quant_config is not None:
+            return self.quant_config.per_out_ch_quant
+        else:
+            return False
 
     @property
     def tp_size(self):
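
The rewritten properties preserve the fallback behaviour of the removed one-liners: with no quant_config attached, quant_dtype and block_shape read as None and the two boolean flags read as False. A minimal stand-in sketch of that contract (illustrative stub classes, not the real FusedMoEConfig, which carries more fields):

```python
# Sketch only: mirrors the None-safe property pattern above with stub types.
from dataclasses import dataclass
from typing import Optional

import torch


@dataclass
class _StubQuantConfig:
    quant_dtype: Optional[torch.dtype] = torch.float8_e4m3fn


@dataclass
class _StubMoEConfig:
    quant_config: Optional[_StubQuantConfig] = None

    @property
    def quant_dtype(self) -> Optional[torch.dtype]:
        if self.quant_config is not None:
            return self.quant_config.quant_dtype
        else:
            return None


assert _StubMoEConfig().quant_dtype is None  # unquantized MoE sees None
assert _StubMoEConfig(_StubQuantConfig()).quant_dtype is torch.float8_e4m3fn
```
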
@@ -338,7 +350,8 @@ def make(
             quant_dtype = torch.float8_e4m3fn
 
             if weight_quant is not None:
-                per_out_ch_quant = weight_quant.strategy == QuantizationStrategy.CHANNEL
+                per_out_ch_quant = (
+                    weight_quant.strategy == QuantizationStrategy.CHANNEL)
 
             assert quant_dtype is not None
 

vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py

Lines changed: 1 addition & 0 deletions

@@ -82,6 +82,7 @@ def _do_quant(
 
         assert isinstance(x, torch.Tensor)
 
+        # TODO (bnell):
         # Check if there is a block_shape / or if we can infer the quantization
         # schemes from the scales.
         # per_token_quant = None

vllm/model_executor/layers/fused_moe/fused_batched_moe.py

Lines changed: 12 additions & 6 deletions

@@ -303,7 +303,8 @@ def batched_triton_kernel(
     if (group_k > 0 and group_n > 0) or per_channel_quant:
         a_scale_ptr = a_scale_ptr + (expert_id *
                                      stride_ase) + cta_m_start * stride_asm
-        #b_scale_ptr = b_scale_ptr + (expert_id * stride_bse) # + cta_n_start * stride_bsn?
+        #b_scale_ptr = b_scale_ptr + (expert_id * stride_bse)
+        # (?) b_scale_ptr = b_scale_ptr + cta_n_start * stride_bsn
     # channel-wise or tensor-wise
     else:
         a_scale_ptr = a_scale_ptr + (expert_id * stride_ase)
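
For reference, the a_scale_ptr arithmetic kept above steps through a per-expert activation-scale buffer: one expert stride plus one row stride per starting row of the tile. A host-side sketch of the equivalent offset computation (the [experts, tokens, k-tiles] layout and the variable names are assumptions for illustration, not the kernel's actual arguments):

```python
# Sketch: how expert_id and cta_m_start turn into a flat element offset,
# mirroring a_scale_ptr + expert_id * stride_ase + cta_m_start * stride_asm.
import torch

num_experts, max_tokens, k_tiles = 4, 8, 2
a_scale = torch.rand(num_experts, max_tokens, k_tiles)

stride_ase, stride_asm, _ = a_scale.stride()
expert_id, cta_m_start = 2, 4

flat_offset = expert_id * stride_ase + cta_m_start * stride_asm

# The flat offset reaches the same element as ordinary indexing.
assert a_scale.flatten()[flat_offset] == a_scale[expert_id, cta_m_start, 0]
```
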
@@ -379,9 +380,10 @@ def invoke_moe_batched_triton_kernel(
     grid = (expert_num_tokens.size(0), triton.cdiv(max_num_tokens, BLOCK_M) *
             triton.cdiv(B.size(1), BLOCK_N))
 
-    assert A_scale is None or A_scale.ndim == 3, f"{0 if A_scale is None else A_scale.shape}"
-    assert B_scale is None or B_scale.ndim == 1 or B_scale.ndim == 3, f"{0 if B_scale is None else B_scale.shape}"
-    #assert B_scale is None or B_scale.ndim == 3, f"{0 if B_scale is None else (A.shape, B_scale.shape)}"
+    assert A_scale is None or A_scale.ndim == 3, (
+        f"{0 if A_scale is None else A_scale.shape}")
+    assert B_scale is None or B_scale.ndim == 1 or B_scale.ndim == 3, (
+        f"{0 if B_scale is None else B_scale.shape}")
 
     if B_scale is not None:
         if B_scale.ndim == 1:

@@ -522,7 +524,10 @@ def prepare(
                 k_tiles = (hidden_dim + block_k - 1) // block_k
                 scale_shape = (num_local_experts, self.max_num_tokens, k_tiles)
             else:
-                num = self.max_num_tokens if quant_config.per_act_token_quant else 1
+                if quant_config.per_act_token_quant:
+                    num = self.max_num_tokens
+                else:
+                    num = 1
                 scale_shape = (num_local_experts, num, 1)
 
             #print(f"SCALE_SHAPE {block_shape} {b_a1.shape} {scale_shape}")
@@ -555,7 +560,8 @@ def prepare(
                     quant_config.per_act_token_quant,
                     quant_config.block_shape,
                 ))
-                if quant_config.block_shape is None and not quant_config.per_act_token_quant:
+                if (quant_config.block_shape is None and
+                    not quant_config.per_act_token_quant):
                     b_a1_scale[idx] = b_s
                 else:
                     #print(f"XXXXX rhs={rhs.shape} b_s={b_s.shape}")

vllm/model_executor/layers/fused_moe/fused_moe.py

Lines changed: 3 additions & 2 deletions

@@ -556,8 +556,9 @@ def invoke_fused_moe_kernel(A: torch.Tensor,
 
     if use_fp8_w8a8 or use_int8_w8a8:
         assert B_scale is not None
-        # assert (block_shape is None or triton.cdiv(B.shape[-2], block_shape[0])
-        # == B_scale.shape[-2]), f"{block_shape[0]} {B.shape[-2]} {B_scale.shape[-2]}"
+        assert (block_shape is None or triton.cdiv(B.shape[-2], block_shape[0])
+                == B_scale.shape[-2]), (
+                    f"{block_shape[0]} {B.shape[-2]} {B_scale.shape[-2]}")
         assert (block_shape is None or triton.cdiv(B.shape[-1], block_shape[1])
                 == B_scale.shape[-1])
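
The re-enabled assert ties block-quantized weight scales to the weight shape: B_scale must carry one scale per [block_shape[0] x block_shape[1]] tile of B, in both the N and K dimensions. A small numeric check of the same relation (the shapes here are illustrative, not taken from a particular model):

```python
# Sketch: for B of shape [E, N, K] with block_shape = [128, 128], B_scale
# needs ceil(N / 128) x ceil(K / 128) entries per expert, which is exactly
# what the pair of asserts above verifies.
import torch
import triton

E, N, K = 8, 2048, 7168
block_shape = [128, 128]

B = torch.empty(E, N, K, dtype=torch.float8_e4m3fn)
B_scale = torch.empty(E, N // 128, K // 128, dtype=torch.float32)  # [8, 16, 56]

assert triton.cdiv(B.shape[-2], block_shape[0]) == B_scale.shape[-2]
assert triton.cdiv(B.shape[-1], block_shape[1]) == B_scale.shape[-1]
```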

vllm/model_executor/layers/fused_moe/layer.py

Lines changed: 6 additions & 8 deletions

@@ -20,20 +20,18 @@
 from vllm.model_executor.custom_op import CustomOp
 from vllm.model_executor.layers.fused_moe.config import (
     FusedMoEConfig, FusedMoEParallelConfig)
+from vllm.model_executor.layers.fused_moe.modular_kernel import (
+    FusedMoEModularKernel, FusedMoEPermuteExpertsUnpermute,
+    FusedMoEPrepareAndFinalize)
 from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import (
     is_rocm_aiter_moe_enabled)
 from vllm.model_executor.layers.quantization.base_config import (
     QuantizationConfig, QuantizeMethodBase)
-from vllm.model_executor.layers.fused_moe.modular_kernel import (
-    FusedMoEModularKernel,
-    FusedMoEPermuteExpertsUnpermute,
-    FusedMoEPrepareAndFinalize)
 from vllm.model_executor.utils import set_weight_attrs
 from vllm.platforms import current_platform
 from vllm.platforms.interface import CpuArchEnum
 from vllm.utils import direct_register_custom_op
 
-
 has_pplx = importlib.util.find_spec("pplx_kernels") is not None
 has_deepep = importlib.util.find_spec("deep_ep") is not None

@@ -95,9 +93,9 @@ def init_prepare_finalize(self, moe: FusedMoEConfig,
                 block_shape=moe.block_shape,
             )
 
-            logger.debug(
-                f"All2All {moe.quant_dtype}, {moe.block_shape} = {hidden_dim_bytes}/{hidden_scale_bytes}"
-            )
+            logger.debug("All2All %s, %s = %s/%s", moe.quant_dtype,
+                         moe.block_shape, hidden_dim_bytes,
+                         hidden_scale_bytes)
 
             all_to_all_args = dict(
                 max_num_tokens=moe.max_num_tokens,
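
Switching the f-string to %-style arguments defers formatting to the logging framework: the message is only built if a DEBUG record is actually emitted, and lint checks that flag f-strings inside logging calls are satisfied. A small standalone illustration of the pattern (plain stdlib logging, not vllm's logger setup; the values are made up):

```python
# Sketch: eager f-string formatting vs. lazy %-formatting in logging calls.
import logging

logging.basicConfig(level=logging.INFO)  # DEBUG is disabled here
logger = logging.getLogger("example")

quant_dtype, block_shape = None, [128, 128]
hidden_dim_bytes, hidden_scale_bytes = 14336, 224

# Eager: the f-string is evaluated even though the record is never emitted.
logger.debug(f"All2All {quant_dtype}, {block_shape} = "
             f"{hidden_dim_bytes}/{hidden_scale_bytes}")

# Lazy: the arguments are only interpolated when DEBUG logging is enabled.
logger.debug("All2All %s, %s = %s/%s", quant_dtype, block_shape,
             hidden_dim_bytes, hidden_scale_bytes)
```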
