Commit 3ca8322

lint

Signed-off-by: Bill Nell <bnell@redhat.com>

Parent: 03b41b6

File tree

4 files changed, +10 -28 lines changed


tests/kernels/moe/test_batched_moe.py

Lines changed: 0 additions & 6 deletions
@@ -205,13 +205,7 @@ def test_batched_mm(num_experts: int, max_tokens_per_expert: int, K: int,
     use_fp8_w8a8 = dtype == torch.torch.float8_e4m3fn
     block_shape = [16, 16, 32] # 16 for k if not fp8
 
-    #print(f"tensors.A {tensors.A.shape}")
-    #print(f"tensors.B {tensors.B.shape}")
-
     if use_fp8_w8a8:
-        #A_scale = torch.ones((1, K), dtype=torch.float32, device=tensors.A.device)
-        #B_scale = torch.ones((N, K), dtype=torch.float32, device=tensors.A.device)
-        #quant_block_shape = [N, K]
         A_scale = torch.ones(1, dtype=torch.float32, device=tensors.A.device)
         B_scale = torch.ones(1, dtype=torch.float32, device=tensors.B.device)
         quant_block_shape = [1, 1]
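
In the fp8 branch the test keeps per-tensor unit scales and a [1, 1] quant block rather than the commented-out per-channel variants. As a minimal sketch of what a per-tensor fp8 quantize/dequantize round trip looks like (plain PyTorch only; quantize_fp8_per_tensor is a hypothetical helper, not the kernel under test):

import torch

def quantize_fp8_per_tensor(x: torch.Tensor, scale: torch.Tensor) -> torch.Tensor:
    # Hypothetical helper: scale down, clamp to the representable e4m3
    # range, then cast to the fp8 storage dtype.
    finfo = torch.finfo(torch.float8_e4m3fn)
    x_scaled = (x / scale).clamp(min=finfo.min, max=finfo.max)
    return x_scaled.to(torch.float8_e4m3fn)

A = torch.randn(4, 8)
A_scale = torch.ones(1, dtype=torch.float32)  # unit scale, as in the test
A_q = quantize_fp8_per_tensor(A, A_scale)
A_dq = A_q.to(torch.float32) * A_scale        # dequantize with the same scale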

tests/kernels/moe/test_pplx_moe.py

Lines changed: 1 addition & 0 deletions
@@ -63,6 +63,7 @@
     reason="Requires PPLX kernels",
 )
 
+
 @dataclasses.dataclass
 class ProcessGroupInfo:
     world_size: int
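
For context: the single added line gives the @dataclasses.dataclass definition the two blank lines PEP 8 requires before top-level definitions (pycodestyle reports one blank line as E302). A minimal illustration, separate from this file:

import dataclasses

requires_pplx = True  # placeholder module-level statement


@dataclasses.dataclass
class Example:  # the two blank lines above satisfy the linter
    value: int = 0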

vllm/model_executor/layers/fused_moe/fused_batched_moe.py

Lines changed: 6 additions & 18 deletions
@@ -10,8 +10,7 @@
 from vllm.model_executor.layers.fused_moe.fused_moe import (
     get_config_dtype_str, try_get_optimal_moe_config)
 from vllm.model_executor.layers.fused_moe.utils import (
-    _resize_cache,
-    moe_kernel_quantize_input)
+    _resize_cache, moe_kernel_quantize_input)
 
 
 @triton.jit
@@ -480,8 +479,7 @@ def prepare(
                         self.qtype,
                         self.per_act_token,
                         self.block_shape,
-                    )
-                )
+                    ))
             else:
                 b_a1[idx, :rows, :] = rhs
 
@@ -652,10 +650,8 @@ def batched_moe_kernel_quantize_input(
         if num_tokens > 0:
             A_q[e, :num_tokens, :], tmp_scale = moe_kernel_quantize_input(
                 A[e, :num_tokens],
-                A_scale[e, :num_tokens] if A_scale else None,
-                qtype,
-                per_channel_quant,
-                [block_k, block_n])
+                A_scale[e, :num_tokens] if A_scale else None, qtype,
+                per_channel_quant, [block_k, block_n])
             A_q_scale[e, :tmp_scale.shape[0]] = tmp_scale
 
     return A_q, A_q_scale
@@ -812,16 +808,8 @@ def apply(
                         intermediate_cache1.view(-1, N))
 
         qintermediate_cache2, a2q_scale = batched_moe_kernel_quantize_input(
-            intermediate_cache2,
-            a2_scale,
-            num_tokens,
-            E,
-            N,
-            expert_num_tokens,
-            self.qtype,
-            self.per_act_token,
-            self.block_shape
-        )
+            intermediate_cache2, a2_scale, num_tokens, E, N, expert_num_tokens,
+            self.qtype, self.per_act_token, self.block_shape)
 
         invoke_moe_batched_triton_kernel(A=qintermediate_cache2,
                                          B=w2,
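
For orientation: batched_moe_kernel_quantize_input walks the expert dimension and quantizes only the first expert_num_tokens[e] rows of each expert's slot, leaving the padded tail untouched. A simplified, self-contained sketch of that loop (the quantize_rows callback is a stand-in for moe_kernel_quantize_input, not its real signature):

import torch

def batched_quantize_sketch(A: torch.Tensor, expert_num_tokens: torch.Tensor,
                            quantize_rows):
    # A is (num_experts, max_tokens_per_expert, hidden); only the first
    # expert_num_tokens[e] rows of each expert slot hold real tokens.
    E = A.shape[0]
    A_q = torch.zeros_like(A)
    scales = []
    for e in range(E):
        n = int(expert_num_tokens[e])
        if n > 0:
            # Quantize just the valid rows; the padded tail stays zero.
            A_q[e, :n, :], s = quantize_rows(A[e, :n])
            scales.append(s)
    return A_q, scales

# Identity "quantizer" with a unit scale, just to exercise the loop.
A_q, scales = batched_quantize_sketch(torch.randn(2, 4, 8),
                                      torch.tensor([3, 0]),
                                      lambda x: (x, torch.ones(1)))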

vllm/model_executor/layers/quantization/fp8.py

Lines changed: 3 additions & 4 deletions
@@ -769,13 +769,12 @@ def process_weights_after_loading(self, layer: Module) -> None:
             del layer.w2_input_scale
 
     def select_gemm_impl(self, prepare_finalize):
-        from vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe import (
-            TritonOrDeepGemmExperts)
         from vllm.model_executor.layers.fused_moe.fused_batched_moe import (
-            BatchedPrepareAndFinalize,
-            BatchedTritonExperts)
+            BatchedPrepareAndFinalize, BatchedTritonExperts)
         from vllm.model_executor.layers.fused_moe.pplx_prepare_finalize import (
             PplxPrepareAndFinalize)
+        from vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe import (
+            TritonOrDeepGemmExperts)
 
         assert not self.use_marlin and not self.rocm_aiter_moe_enabled, (
             "Marlin and ROCm AITER are not supported with all2all yet.")
