Commit 23f26c9

lint
Signed-off-by: Bill Nell <bnell@redhat.com>
1 parent bd9bd37 commit 23f26c9

File tree

7 files changed: +70 -50 lines changed


tests/kernels/moe/test_moe.py

Lines changed: 38 additions & 12 deletions
@@ -5,14 +5,14 @@
 Run `pytest tests/kernels/test_moe.py`.
 """
 import functools
+from typing import Callable, Optional, Union
+
 import pytest
 import torch
-
 from torch.nn import Parameter
 from torch.nn import functional as F
 from transformers import MixtralConfig
 from transformers.models.mixtral.modeling_mixtral import MixtralSparseMoeBlock
-from typing import Callable, Optional, Union

 import vllm.model_executor.layers.fused_moe  # noqa
 from tests.kernels.utils import opcheck, stack_and_dev, torch_moe
@@ -56,13 +56,19 @@ def run_moe_test(
     padding: bool = False,
     use_compile: bool = False,
     use_cudagraph: bool = False,
-    atol:float=2e-2,
-    rtol:float=0,
+    atol: float = 2e-2,
+    rtol: float = 0,
 ) -> torch.Tensor:
     if isinstance(baseline, torch.Tensor):
         baseline_output = baseline
     else:
-        baseline_output = baseline(a, w1, w2, score, topk, global_num_experts=global_num_experts, expert_map=expert_map)
+        baseline_output = baseline(a,
+                                   w1,
+                                   w2,
+                                   score,
+                                   topk,
+                                   global_num_experts=global_num_experts,
+                                   expert_map=expert_map)

     # Pad the weight if moe padding is enabled
     if padding:
@@ -96,7 +102,10 @@ def run_moe_test(
         graph.replay()
         torch.cuda.synchronize()

-    torch.testing.assert_close(test_output, baseline_output, atol=atol, rtol=rtol)
+    torch.testing.assert_close(test_output,
+                               baseline_output,
+                               atol=atol,
+                               rtol=rtol)

     return baseline_output

@@ -167,7 +176,7 @@ def m_fused_moe(
         score: torch.Tensor,
         topk: int,
         global_num_experts: int = -1,
-        expert_map: Optional[torch.Tensor]= None,
+        expert_map: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
         topk_weights, topk_ids, _ = fused_topk(a, score, topk, False)
         return m_fused_moe_fn(a,
@@ -195,13 +204,20 @@ def m_fused_moe(
         padding=padding,
     )

-    use_compile = m >= chunk_size and n >= 1024 and k >= 1024 and current_platform.is_cuda_alike()
+    use_compile = (m >= chunk_size and n >= 1024 and k >= 1024
+                   and current_platform.is_cuda_alike())
    use_cudagraph = use_compile

    with set_current_vllm_config(vllm_config):
        baseline_output = runner(torch_moe, iterative_moe)
-        runner(baseline_output, fused_moe_fn, use_compile=use_compile, use_cudagraph=use_cudagraph)
-        runner(baseline_output, m_fused_moe, use_compile=use_compile, use_cudagraph=use_cudagraph)
+        runner(baseline_output,
+               fused_moe_fn,
+               use_compile=use_compile,
+               use_cudagraph=use_cudagraph)
+        runner(baseline_output,
+               m_fused_moe,
+               use_compile=use_compile,
+               use_cudagraph=use_cudagraph)


 @pytest.mark.parametrize("m", [1, 32, 222])
@@ -311,7 +327,12 @@ def test_fused_moe_wn16(m: int, n: int, k: int, e: int, topk: int,
                               w1_zp=w1_qzeros if has_zp else None,
                               w2_zp=w2_qzeros if has_zp else None,
                               block_shape=[0, group_size])
-    torch_output = torch_moe(a, w1_ref, w2_ref, score, topk, expert_map=e_map)
+    torch_output = torch_moe(a,
+                             w1_ref,
+                             w2_ref,
+                             score,
+                             topk,
+                             expert_map=e_map)

     torch.testing.assert_close(triton_output, torch_output, atol=2e-2, rtol=0)

@@ -619,7 +640,12 @@ def test_fused_marlin_moe(
     topk_weights, topk_ids, _ = fused_topk(a, score, topk, False)

     with set_current_vllm_config(vllm_config):
-        torch_output = torch_moe(a, w_ref1, w_ref2, score, topk, expert_map=e_map)
+        torch_output = torch_moe(a,
+                                 w_ref1,
+                                 w_ref2,
+                                 score,
+                                 topk,
+                                 expert_map=e_map)

         marlin_output = torch.ops.vllm.fused_marlin_moe(
             a,
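
Note on the pattern above: `run_moe_test` compares each candidate MoE kernel against a baseline under the default tolerances atol=2e-2, rtol=0. A stripped-down sketch of that comparison flow (it omits the padding, torch.compile and CUDA graph handling of the real helper; `candidate_fn` and `baseline` are placeholder names):

import torch

def compare_against_baseline(candidate_fn, baseline, *args,
                             atol: float = 2e-2, rtol: float = 0):
    # Accept either a precomputed baseline tensor or a callable producing one,
    # mirroring the run_moe_test signature.
    if isinstance(baseline, torch.Tensor):
        baseline_output = baseline
    else:
        baseline_output = baseline(*args)
    test_output = candidate_fn(*args)
    # rtol=0 means only the absolute tolerance is applied.
    torch.testing.assert_close(test_output, baseline_output, atol=atol, rtol=rtol)
    return baseline_output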

tests/kernels/moe/test_pplx_cutlass_moe.py

Lines changed: 1 addition & 3 deletions
@@ -6,17 +6,15 @@
 import pytest
 import torch

+from tests.kernels.utils import torch_experts
 from vllm import _custom_ops as ops
 from vllm.config import VllmConfig, set_current_vllm_config
-from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.fused_moe.cutlass_moe import CutlassExpertsFp8
 from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk
 from vllm.model_executor.layers.fused_moe.modular_kernel import (
     FusedMoEModularKernel)
 from vllm.platforms import current_platform

-from tests.kernels.utils import torch_experts
-
 from .deepep_utils import ProcessGroupInfo, parallel_launch

 try:

tests/kernels/moe/test_pplx_moe.py

Lines changed: 1 addition & 3 deletions
@@ -18,8 +18,8 @@
 except ImportError:
     has_pplx = False

+from tests.kernels.utils import torch_experts
 from vllm.config import VllmConfig, set_current_vllm_config
-from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.fused_moe import override_config
 from vllm.model_executor.layers.fused_moe.fused_batched_moe import (
     BatchedExperts, BatchedPrepareAndFinalize, BatchedTritonExperts)
@@ -29,8 +29,6 @@
     FusedMoEModularKernel)
 from vllm.platforms import current_platform

-from tests.kernels.utils import torch_experts
-
 from .deepep_utils import ProcessGroupInfo, parallel_launch

 requires_pplx = pytest.mark.skipif(

tests/kernels/quantization/test_block_fp8.py

Lines changed: 8 additions & 4 deletions
@@ -403,7 +403,8 @@ def deep_gemm_w8a8_block_fp8_moe(M, K, a, w1, w2, w1_s, w2_s, score, topk,
                         itertools.product(M_moe_dg, N_moe, K_moe, E, TOP_KS, SEEDS))
 @pytest.mark.skipif(not dg_available, reason="DeepGemm kernels not available.")
 @torch.inference_mode()
-def test_w8a8_block_fp8_deep_gemm_fused_moe(M, N, K, E, topk, seed, monkeypatch):
+def test_w8a8_block_fp8_deep_gemm_fused_moe(M, N, K, E, topk, seed,
+                                            monkeypatch):
     if topk > E:
         pytest.skip(f"Skipping test: topk={topk} > E={E}")

@@ -455,7 +456,8 @@ def test_w8a8_block_fp8_deep_gemm_fused_moe(M, N, K, E, topk, seed, monkeypatch)
         w1[i], w1_s[i] = per_block_cast_to_fp8(w1_bf16[i])
         w2[i], w2_s[i] = per_block_cast_to_fp8(w2_bf16[i])

-    use_compile = M > chunk_size and N >= 1024 and K >= 1024 and current_platform.is_cuda_alike()
+    use_compile = (chunk_size < M and N >= 1024 and K >= 1024
+                   and current_platform.is_cuda_alike())
     use_cudagraph = use_compile

     # Set the context to avoid lots of warning spam.
@@ -477,14 +479,16 @@ def test_w8a8_block_fp8_deep_gemm_fused_moe(M, N, K, E, topk, seed, monkeypatch)
     else:
         deep_gemm_moe_fp8_fn = deep_gemm_moe_fp8

-    out = deep_gemm_moe_fp8_fn(a, w1, w2, w1_s, w2_s, topk_weights, topk_ids)
+    out = deep_gemm_moe_fp8_fn(a, w1, w2, w1_s, w2_s, topk_weights,
+                               topk_ids)

     if use_cudagraph:
         out.fill_(0)
         stream = torch.cuda.Stream()
         graph = torch.cuda.CUDAGraph()
         with torch.cuda.graph(graph, stream=stream):
-            out = deep_gemm_moe_fp8_fn(a, w1, w2, w1_s, w2_s, topk_weights, topk_ids)
+            out = deep_gemm_moe_fp8_fn(a, w1, w2, w1_s, w2_s, topk_weights,
+                                       topk_ids)
         torch.cuda.synchronize()
         graph.replay()
         torch.cuda.synchronize()
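
For reference, the capture-and-replay sequence in this hunk is the standard torch.cuda.CUDAGraph pattern: run the kernel once eagerly to warm up, capture it on a side stream, then replay the graph. A minimal sketch with a placeholder `fn` standing in for `deep_gemm_moe_fp8_fn` (assumes `fn` uses static shapes and is capture-safe):

import torch

def run_with_cudagraph(fn, *args):
    out = fn(*args)                    # eager warm-up run
    out.fill_(0)                       # make the replayed result observable
    stream = torch.cuda.Stream()
    graph = torch.cuda.CUDAGraph()
    with torch.cuda.graph(graph, stream=stream):
        out = fn(*args)                # recorded into the graph, not executed
    torch.cuda.synchronize()
    graph.replay()                     # runs the captured kernels
    torch.cuda.synchronize()
    return out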

tests/kernels/utils.py

Lines changed: 19 additions & 22 deletions
@@ -1054,18 +1054,16 @@ def compute_max_diff(output, output_ref):
         torch.abs(output_ref))


-def torch_experts(
-    a: torch.Tensor,
-    w1: torch.Tensor,
-    w2: torch.Tensor,
-    topk_weight: torch.Tensor,
-    topk_ids: torch.Tensor,
-    global_num_experts: int = -1,
-    expert_map: Optional[torch.Tensor] = None
-) -> torch.Tensor:
-    assert (global_num_experts == -1 or
-            (global_num_experts == w1.shape[0] and expert_map is None) or
-            global_num_experts == expert_map.shape[0])
+def torch_experts(a: torch.Tensor,
+                  w1: torch.Tensor,
+                  w2: torch.Tensor,
+                  topk_weight: torch.Tensor,
+                  topk_ids: torch.Tensor,
+                  global_num_experts: int = -1,
+                  expert_map: Optional[torch.Tensor] = None) -> torch.Tensor:
+    assert (global_num_experts == -1
+            or (global_num_experts == w1.shape[0] and expert_map is None)
+            or global_num_experts == expert_map.shape[0])
     topk = topk_ids.shape[1]
     B, D = a.shape
     a = a.view(B, -1, D).repeat(1, topk, 1).reshape(-1, D)
@@ -1083,18 +1081,17 @@ def torch_experts(
             topk_weight.view(B, -1, 1).to(out.dtype)).sum(dim=1)


-def torch_moe(
-    a: torch.Tensor,
-    w1: torch.Tensor,
-    w2: torch.Tensor,
-    score: torch.Tensor,
-    topk: int,
-    global_num_experts: int = -1,
-    expert_map: Optional[torch.Tensor] = None
-) -> torch.Tensor:
+def torch_moe(a: torch.Tensor,
+              w1: torch.Tensor,
+              w2: torch.Tensor,
+              score: torch.Tensor,
+              topk: int,
+              global_num_experts: int = -1,
+              expert_map: Optional[torch.Tensor] = None) -> torch.Tensor:
     score = torch.softmax(score, dim=-1, dtype=torch.float32)
     topk_weight, topk_ids = torch.topk(score, topk)
-    return torch_experts(a, w1, w2, topk_weight, topk_ids, global_num_experts, expert_map)
+    return torch_experts(a, w1, w2, topk_weight, topk_ids, global_num_experts,
+                         expert_map)


 def torch_moe_single(a, w, score, topk):
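
Usage note: `torch_moe` is the dense reference the fused kernels are checked against; it softmaxes the router scores, picks the top-k experts per token, and forwards to `torch_experts`. A small sketch of driving it directly (shapes, dtype and CPU placement are illustrative; the actual tests run in half precision on CUDA):

import torch
from tests.kernels.utils import torch_moe

m, n, k, e, topk = 32, 256, 128, 8, 2
a = torch.randn(m, k)                 # token activations
w1 = torch.randn(e, 2 * n, k) / 10    # per-expert gate/up projection
w2 = torch.randn(e, k, n) / 10        # per-expert down projection
score = torch.randn(m, e)             # router logits

# Dense reference output; fused kernels are compared to this with
# torch.testing.assert_close(..., atol=2e-2, rtol=0).
ref = torch_moe(a, w1, w2, score, topk)
assert ref.shape == (m, k)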

vllm/model_executor/layers/fused_moe/deep_gemm_moe.py

Lines changed: 0 additions & 2 deletions
@@ -7,8 +7,6 @@
 import torch

 import vllm.model_executor.layers.fused_moe.modular_kernel as mk
-import vllm.model_executor.layers.quantization.deepgemm
-
 from vllm.logger import init_logger
 from vllm.model_executor.layers.fused_moe.moe_permute_unpermute import (
     _moe_permute)

vllm/model_executor/layers/quantization/deepgemm.py

Lines changed: 3 additions & 4 deletions
@@ -1,11 +1,10 @@
 # SPDX-License-Identifier: Apache-2.0
 import importlib.util
 import logging
+from typing import Optional

 import torch

-from typing import Optional
-
 from vllm.platforms import current_platform
 from vllm.triton_utils import triton
 from vllm.utils import direct_register_custom_op
@@ -86,7 +85,8 @@ def m_grouped_gemm_fp8_fp8_bf16_nt_contiguous_deepgemm(
     expert_ids: torch.Tensor,
 ) -> None:
     import deep_gemm as dg
-    dg.m_grouped_gemm_fp8_fp8_bf16_nt_contiguous((a, a_scale), (b, b_scale), output, expert_ids)
+    dg.m_grouped_gemm_fp8_fp8_bf16_nt_contiguous((a, a_scale), (b, b_scale),
+                                                 output, expert_ids)


 direct_register_custom_op(
@@ -97,7 +97,6 @@ def m_grouped_gemm_fp8_fp8_bf16_nt_contiguous_deepgemm(
     dispatch_key=current_platform.dispatch_key,
 )

-
 direct_register_custom_op(
     op_name="m_grouped_gemm_fp8_fp8_bf16_nt_contiguous_deepgemm",
     op_func=m_grouped_gemm_fp8_fp8_bf16_nt_contiguous_deepgemm,
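
Usage note (an assumption based on the registration pattern, not something shown in this diff): once direct_register_custom_op has registered the wrapper, callers would reach it through the torch.ops.vllm namespace, the same way torch.ops.vllm.fused_marlin_moe is invoked in test_moe.py above. A hedged sketch, with the argument order inferred from the wrapper's dg.m_grouped_gemm_fp8_fp8_bf16_nt_contiguous call:

import torch

def grouped_gemm_via_registered_op(a, a_scale, b, b_scale, output, expert_ids):
    # Assumed call shape: (a, a_scale, b, b_scale, output, expert_ids),
    # matching the registered wrapper's parameters; the op writes its
    # result into `output` in place and returns None.
    torch.ops.vllm.m_grouped_gemm_fp8_fp8_bf16_nt_contiguous_deepgemm(
        a, a_scale, b, b_scale, output, expert_ids)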
