Commit 12e42ea

more lint
Signed-off-by: Bill Nell <bnell@redhat.com>
1 parent 5e22409 commit 12e42ea

12 files changed, 37 insertions(+), 31 deletions(-)


tests/kernels/moe/test_batched_moe.py

Lines changed: 12 additions & 6 deletions
@@ -8,11 +8,11 @@
 import torch
 import triton.language as tl
 
-from tests.kernels.utils import torch_experts
 from tests.kernels.moe.utils import (batched_moe,
                                      make_quantized_test_activations,
                                      make_test_weights, triton_moe)
 from tests.kernels.quant_utils import native_w8a8_block_matmul
+from tests.kernels.utils import torch_experts
 from vllm.config import VllmConfig, set_current_vllm_config
 from vllm.model_executor.layers.fused_moe.fused_batched_moe import (
     invoke_moe_batched_triton_kernel)
@@ -265,11 +265,17 @@ def test_fused_moe_batched_experts(
         batched_output = batched_moe(a, w1, w2, topk_weight, topk_ids, w1_s,
                                      w2_s, quant_dtype, per_act_token_quant,
                                      block_shape)
-        baseline_output = torch_experts(a, w1, w2, topk_weight, topk_ids,
-                                        w1_scale=w1_s, w2_scale=w2_s,
-                                        quant_dtype=quant_dtype,
-                                        per_act_token_quant=per_act_token_quant,
-                                        block_shape=block_shape)
+        baseline_output = torch_experts(
+            a,
+            w1,
+            w2,
+            topk_weight,
+            topk_ids,
+            w1_scale=w1_s,
+            w2_scale=w2_s,
+            quant_dtype=quant_dtype,
+            per_act_token_quant=per_act_token_quant,
+            block_shape=block_shape)
         triton_output = triton_moe(a, w1, w2, topk_weight, topk_ids, w1_s,
                                    w2_s, quant_dtype, per_act_token_quant,
                                    block_shape)

tests/kernels/moe/test_pplx_moe.py

Lines changed: 7 additions & 2 deletions
@@ -577,8 +577,13 @@ def _pplx_moe(
 
     with set_current_vllm_config(vllm_config), override_config(moe_config):
         topk_weight, topk_ids, _ = fused_topk(a, score, topk, False)
-        torch_output = torch_experts(a, w1, w2, topk_weight, topk_ids,
-                                     w1_scale=w1_s, w2_scale=w2_s,
+        torch_output = torch_experts(a,
+                                     w1,
+                                     w2,
+                                     topk_weight,
+                                     topk_ids,
+                                     w1_scale=w1_s,
+                                     w2_scale=w2_s,
                                      quant_dtype=qtype,
                                      per_act_token_quant=per_act_token_quant,
                                      block_shape=block_shape)

tests/kernels/moe/utils.py

Lines changed: 0 additions & 3 deletions
@@ -4,14 +4,11 @@
 
 import torch
 
-from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.fused_moe import fused_experts
 from vllm.model_executor.layers.fused_moe.fused_batched_moe import (
     BatchedPrepareAndFinalize, BatchedTritonExperts, NaiveBatchedExperts)
 from vllm.model_executor.layers.fused_moe.modular_kernel import (
     FusedMoEModularKernel)
-from vllm.model_executor.layers.fused_moe.utils import (
-    moe_kernel_quantize_input)
 from vllm.model_executor.layers.quantization.utils.fp8_utils import (
     per_token_group_quant_fp8)
 from vllm.utils import round_up

tests/kernels/utils.py

Lines changed: 3 additions & 6 deletions
@@ -14,10 +14,10 @@
 from torch._prims_common import TensorLikeType
 
 from tests.kernels.quant_utils import native_w8a8_block_matmul
-
 from vllm.attention import AttentionBackend, AttentionMetadata, AttentionType
 from vllm.model_executor.layers.activation import SiluAndMul
-from vllm.model_executor.layers.fused_moe.utils import moe_kernel_quantize_input
+from vllm.model_executor.layers.fused_moe.utils import (
+    moe_kernel_quantize_input)
 from vllm.platforms.interface import _Backend
 from vllm.utils import (STR_BACKEND_ENV_VAR, STR_FLASH_ATTN_VAL,
                         STR_XFORMERS_ATTN_VAL, make_tensor_with_pad)
@@ -1081,10 +1081,7 @@ def torch_experts(
 
     a = a.view(M, -1, K).repeat(1, topk, 1).reshape(-1, K)
 
-    out = torch.zeros(M * topk,
-                      w2.shape[1],
-                      dtype=a.dtype,
-                      device=a.device)
+    out = torch.zeros(M * topk, w2.shape[1], dtype=a.dtype, device=a.device)
 
     a, a_scale = moe_kernel_quantize_input(a, None, quant_dtype,
                                            per_act_token_quant, block_shape)
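
One of the hunks above wraps an over-long "from ... import x" line in parentheses so the imported name can drop to its own indented line. As a hedged aside (not part of the commit), the snippet below shows the same E501-style fix using a standard-library module purely for illustration:

# Illustrative only: a parenthesized import allows a line break without a
# backslash continuation, keeping the line under the 80-character limit.
from concurrent.futures.process import (
    BrokenProcessPool)

print(BrokenProcessPool.__name__)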

vllm/model_executor/layers/fused_moe/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -45,7 +45,7 @@ def get_config() -> Optional[dict[str, Any]]:
 import vllm.model_executor.layers.fused_moe.fused_moe  # noqa
 from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import (
     BatchedDeepGemmExperts)
-from vllm.model_executor.layers.fused_moe.batched_triton_or_deep_gemm_moe import ( # noqa: E501
+from vllm.model_executor.layers.fused_moe.batched_triton_or_deep_gemm_moe import (  # noqa: E501
     BatchedTritonOrDeepGemmExperts)
 from vllm.model_executor.layers.fused_moe.cutlass_moe import (
     CutlassExpertsFp8, cutlass_moe_fp4, cutlass_moe_fp8)

vllm/model_executor/layers/fused_moe/batched_triton_or_deep_gemm_moe.py

Lines changed: 3 additions & 3 deletions
@@ -74,9 +74,9 @@ def activation_formats(
         self
     ) -> tuple[mk.FusedMoEActivationFormat, mk.FusedMoEActivationFormat]:
         if self.batched_triton_experts is not None:
-            assert (self.batched_deep_gemm_experts is None or
-                    self.batched_deep_gemm_experts.activation_formats ==
-                    self.batched_triton_experts.activation_formats)
+            assert (self.batched_deep_gemm_experts is None
+                    or self.batched_deep_gemm_experts.activation_formats
+                    == self.batched_triton_experts.activation_formats)
             return self.batched_triton_experts.activation_formats
         else:
             assert self.batched_deep_gemm_experts is not None

vllm/model_executor/layers/fused_moe/config.py

Lines changed: 1 addition & 1 deletion
@@ -10,8 +10,8 @@
 
 import vllm.envs as envs
 from vllm.config import ParallelConfig
-from vllm.logger import init_logger
 from vllm.distributed import get_dp_group, get_tensor_model_parallel_rank
+from vllm.logger import init_logger
 from vllm.model_executor.layers.quantization.base_config import (
     QuantizationConfig)
 

vllm/model_executor/layers/fused_moe/deep_gemm_moe.py

Lines changed: 3 additions & 1 deletion
@@ -6,7 +6,6 @@
 
 import torch
 
-import vllm.model_executor.layers.quantization.deepgemm
 import vllm.model_executor.layers.fused_moe.modular_kernel as mk
 from vllm.logger import init_logger
 from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig
@@ -16,6 +15,9 @@
     MoEPrepareAndFinalizeNoEP)
 from vllm.model_executor.layers.fused_moe.utils import (
     _resize_cache, per_token_group_quant_fp8)
+from vllm.model_executor.layers.quantization.deepgemm import (
+    m_grouped_gemm_fp8_fp8_bf16_nt_contiguous_deepgemm as
+    m_grouped_gemm_fp8_fp8_bf16_nt_contiguous_deepgemm)
 from vllm.utils import round_up
 
 logger = init_logger(__name__)
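
The new deepgemm import above uses the "import a name as the same name" form. As a hedged aside (not part of the commit), the snippet below sketches that idiom with a stand-in symbol from the standard library: binding an import to its own name is the conventional way to mark it as an intentional re-export, which type checkers such as mypy and pyright honor and which some linters treat as a signal that an otherwise unused-looking import is deliberate.

# Illustrative only; `join` stands in for the real deepgemm kernel symbol.
from os.path import join as join  # redundant alias marks an explicit re-export

__all__ = ["join"]

if __name__ == "__main__":
    print(join("a", "b"))  # the re-exported name remains usable locally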

vllm/model_executor/layers/fused_moe/layer.py

Lines changed: 1 addition & 1 deletion
@@ -230,7 +230,7 @@ def select_gemm_impl(
         assert all2all_manager is not None
 
         if (prepare_finalize.activation_format ==
-            FusedMoEActivationFormat.BatchedExperts):
+                FusedMoEActivationFormat.BatchedExperts):
             logger.debug("BatchedTritonExperts %s", self.moe)
             assert self.moe.dp_size == all2all_manager.dp_world_size
             return BatchedTritonExperts(

vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py

Lines changed: 3 additions & 3 deletions
@@ -49,9 +49,9 @@ def __init__(
     def activation_formats(
         self
     ) -> tuple[mk.FusedMoEActivationFormat, mk.FusedMoEActivationFormat]:
-        assert (self.deep_gemm_expert is None or
-                self.triton_expert.activation_formats ==
-                self.deep_gemm_expert.activation_formats)
+        assert (self.deep_gemm_expert is None
+                or self.triton_expert.activation_formats
+                == self.deep_gemm_expert.activation_formats)
         return self.triton_expert.activation_formats
 
     def supports_chunking(self) -> bool:
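
Both assert rewrites in this commit move the boolean operators to the start of the continuation lines, the "break before binary operator" form that PEP 8 now recommends (the old form is what pycodestyle's W504 check flags when enabled). As a hedged aside with made-up names, a minimal sketch of the same pattern:

# Illustrative only: `a` and `b` are hypothetical expert objects.
def formats_match(a, b) -> bool:
    # Inside parentheses, a leading `or` / `==` on a continuation line is
    # valid Python and keeps each operator visible at the left margin.
    return (a is None
            or b is None
            or a.activation_formats
            == b.activation_formats)

print(formats_match(None, None))  # True: short-circuits on the first clause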
