
Commit 5e22409

fixes

Signed-off-by: Bill Nell <bnell@redhat.com>

1 parent 7a95679

25 files changed: +235 / -315 lines changed

requirements/test.txt
Lines changed: 20 additions & 2 deletions

@@ -31,6 +31,10 @@ argcomplete==3.5.1
     # via datamodel-code-generator
 arrow==1.3.0
     # via isoduration
+async-timeout==5.0.1
+    # via
+    #   aiohttp
+    #   redis
 attrs==24.2.0
     # via
     #   aiohttp
@@ -141,6 +145,11 @@ eval-type-backport==0.2.2
     # via mteb
 evaluate==0.4.3
     # via lm-eval
+exceptiongroup==1.3.0
+    # via
+    #   anyio
+    #   hypothesis
+    #   pytest
 fastparquet==2024.11.0
     # via genai-perf
 fastrlock==0.8.2
@@ -690,7 +699,6 @@ setuptools==77.0.3
     # via
     #   mamba-ssm
     #   pytablewriter
-    #   torch
     #   triton
 shellingham==1.5.4
     # via typer
@@ -753,8 +761,13 @@ tokenizers==0.21.1
     # via
     #   -r requirements/test.in
     #   transformers
+toml==0.10.2
+    # via datamodel-code-generator
 tomli==2.2.1
-    # via schemathesis
+    # via
+    #   black
+    #   pytest
+    #   schemathesis
 tomli-w==1.2.0
     # via schemathesis
 torch==2.7.0+cu128
@@ -828,13 +841,18 @@ types-python-dateutil==2.9.0.20241206
     # via arrow
 typing-extensions==4.12.2
     # via
+    #   anyio
+    #   black
+    #   exceptiongroup
     #   huggingface-hub
     #   librosa
     #   mistral-common
     #   mteb
+    #   multidict
     #   pqdm
     #   pydantic
     #   pydantic-core
+    #   rich
     #   torch
     #   typer
     #   typing-inspection

tests/kernels/moe/test_batched_moe.py
Lines changed: 18 additions & 14 deletions

@@ -8,12 +8,10 @@
 import torch
 import triton.language as tl

-from tests.kernels.moe.utils import (
-    batched_moe,
-    make_test_weights,
-    make_quantized_test_activations,
-    torch_moe2,
-    triton_moe)
+from tests.kernels.utils import torch_experts
+from tests.kernels.moe.utils import (batched_moe,
+                                     make_quantized_test_activations,
+                                     make_test_weights, triton_moe)
 from tests.kernels.quant_utils import native_w8a8_block_matmul
 from vllm.config import VllmConfig, set_current_vllm_config
 from vllm.model_executor.layers.fused_moe.fused_batched_moe import (
@@ -109,11 +107,13 @@ def ref_impl(
                          [32, 64, 128, 192, 224, 256, 512])
 @pytest.mark.parametrize("K", [128, 256, 1024])
 @pytest.mark.parametrize("N", [128, 256, 512, 1024])
-@pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.bfloat16])
+@pytest.mark.parametrize("dtype",
+                         [torch.float32, torch.float16, torch.bfloat16])
 @pytest.mark.parametrize("block_shape", [None])
 @pytest.mark.parametrize("per_act_token_quant", [False])
 def test_batched_mm(num_experts: int, max_tokens_per_expert: int, K: int,
-                    N: int, dtype: torch.dtype, block_shape: Optional[list[int]],
+                    N: int, dtype: torch.dtype,
+                    block_shape: Optional[list[int]],
                     per_act_token_quant: bool):
     current_platform.seed_everything(7)

@@ -144,8 +144,7 @@ def test_batched_mm(num_experts: int, max_tokens_per_expert: int, K: int,
         in_dtype=act_dtype,
         quant_dtype=quant_dtype,
         block_shape=block_shape,
-        per_act_token_quant=per_act_token_quant
-    )
+        per_act_token_quant=per_act_token_quant)

     B, B_q, B_scale, _, _, _ = make_test_weights(
         num_experts,
@@ -252,7 +251,10 @@ def test_fused_moe_batched_experts(
         act_dtype = dtype
         quant_dtype = None

-    _, w1, w1_s, _, w2, w2_s = make_test_weights(e, n, k, block_shape=block_shape,
+    _, w1, w1_s, _, w2, w2_s = make_test_weights(e,
+                                                 n,
+                                                 k,
+                                                 block_shape=block_shape,
                                                  in_dtype=act_dtype,
                                                  quant_dtype=quant_dtype)

@@ -263,9 +265,11 @@ def test_fused_moe_batched_experts(
         batched_output = batched_moe(a, w1, w2, topk_weight, topk_ids, w1_s,
                                      w2_s, quant_dtype, per_act_token_quant,
                                      block_shape)
-        baseline_output = torch_moe2(a, w1, w2, topk_weight, topk_ids, w1_s,
-                                     w2_s, quant_dtype, per_act_token_quant,
-                                     block_shape)
+        baseline_output = torch_experts(a, w1, w2, topk_weight, topk_ids,
+                                        w1_scale=w1_s, w2_scale=w2_s,
+                                        quant_dtype=quant_dtype,
+                                        per_act_token_quant=per_act_token_quant,
+                                        block_shape=block_shape)
         triton_output = triton_moe(a, w1, w2, topk_weight, topk_ids, w1_s,
                                    w2_s, quant_dtype, per_act_token_quant,
                                    block_shape)
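
Note: the last hunk replaces the torch_moe2 baseline with torch_experts and passes the scale and quantization arguments by keyword. The pattern this relies on can be shown with a small, self-contained sketch (reference_experts and its placeholder body are hypothetical, not vLLM code); keyword-only parameters stop the optional quantization arguments from being transposed at the call site.

from typing import Optional

import torch


def reference_experts(a: torch.Tensor,
                      w1: torch.Tensor,
                      w2: torch.Tensor,
                      topk_weight: torch.Tensor,
                      topk_ids: torch.Tensor,
                      *,  # everything below must be passed by keyword
                      w1_scale: Optional[torch.Tensor] = None,
                      w2_scale: Optional[torch.Tensor] = None,
                      quant_dtype: Optional[torch.dtype] = None,
                      per_act_token_quant: bool = False,
                      block_shape: Optional[list[int]] = None) -> torch.Tensor:
    # Placeholder body: a real reference would dequantize the weights, apply
    # both expert projections and the activation, then combine the per-token
    # expert outputs with topk_weight.
    return a


out = reference_experts(torch.randn(4, 8), torch.randn(2, 16, 8),
                        torch.randn(2, 8, 8), torch.rand(4, 2),
                        torch.zeros(4, 2, dtype=torch.long),
                        w1_scale=None, w2_scale=None, quant_dtype=None,
                        per_act_token_quant=False, block_shape=None)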

tests/kernels/moe/test_block_fp8.py
Lines changed: 5 additions & 6 deletions

@@ -7,8 +7,8 @@
 import pytest
 import torch

-from tests.kernels.quant_utils import (native_w8a8_block_matmul,
-                                       native_per_token_group_quant_fp8,
+from tests.kernels.quant_utils import (native_per_token_group_quant_fp8,
+                                       native_w8a8_block_matmul,
                                        per_block_cast_to_fp8)
 from vllm.config import VllmConfig, set_current_vllm_config
 from vllm.model_executor.layers.activation import SiluAndMul
@@ -20,7 +20,7 @@
 from vllm.model_executor.layers.fused_moe.moe_align_block_size import (
     moe_align_block_size)
 from vllm.model_executor.layers.quantization.utils.fp8_utils import (
-    per_token_group_quant_fp8, w8a8_block_fp8_matmul)
+    per_token_group_quant_fp8)
 from vllm.platforms import current_platform

 dg_available = False
@@ -261,9 +261,8 @@ def deep_gemm_w8a8_block_fp8_moe(M, K, a, w1, w2, w1_s, w2_s, score, topk,
     return final_out


-@pytest.mark.parametrize(
-    "M,N,K,E,topk,seed",
-    itertools.product(M_dg, N, K, E, TOP_KS, SEEDS))
+@pytest.mark.parametrize("M,N,K,E,topk,seed",
+                         itertools.product(M_dg, N, K, E, TOP_KS, SEEDS))
 @pytest.mark.skipif(not dg_available, reason="DeepGemm kernels not available.")
 @torch.inference_mode()
 def test_w8a8_block_fp8_deep_gemm_fused_moe(M, N, K, E, topk, seed,
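
Note: the last hunk only reflows the parametrize decorator, which relies on pytest accepting a single comma-separated argname string plus an iterable of tuples. A minimal, self-contained illustration of that pattern (the lists and the assertion below are made up for the example, not values from this test):

import itertools

import pytest

M = [1, 33]
N = [128, 1024]
K = [256]
SEEDS = [0]


@pytest.mark.parametrize("m,n,k,seed", itertools.product(M, N, K, SEEDS))
def test_cartesian_product(m, n, k, seed):
    # itertools.product yields one (m, n, k, seed) tuple per combination,
    # so this test collects 2 * 2 * 1 * 1 = 4 cases.
    assert m > 0 and n % 128 == 0 and k == 256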

tests/kernels/moe/test_block_int8.py
Lines changed: 2 additions & 2 deletions

@@ -7,8 +7,8 @@
 import pytest
 import torch

-from tests.kernels.quant_utils import (native_w8a8_block_matmul,
-                                       native_per_token_group_quant_int8)
+from tests.kernels.quant_utils import (native_per_token_group_quant_int8,
+                                       native_w8a8_block_matmul)
 from vllm.config import VllmConfig, set_current_vllm_config
 from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.fused_moe import fused_moe

tests/kernels/moe/test_deepep_deepgemm_moe.py
Lines changed: 1 addition & 3 deletions

@@ -28,7 +28,6 @@
 has_deep_ep = importlib.util.find_spec("deep_ep") is not None
 has_deep_gemm = importlib.util.find_spec("deep_gemm") is not None

-
 if has_deep_ep:
     from vllm.model_executor.layers.fused_moe.deepep_ht_prepare_finalize import ( # noqa: E501
         DeepEPHTPrepareAndFinalize)
@@ -69,8 +68,7 @@ def per_block_cast_to_fp8(
     assert x.dim() == 2
     m, n = x.shape
     x_padded = torch.zeros(
-        (cdiv(m, 128) * 128,
-         cdiv(n, block_size_n) * block_size_n),
+        (cdiv(m, 128) * 128, cdiv(n, block_size_n) * block_size_n),
         dtype=x.dtype,
         device=x.device)
     x_padded[:m, :n] = x
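
Note: the second hunk simply joins the padded-shape tuple onto one line; the round-up-to-block padding it computes can be sketched in a self-contained way as follows (cdiv and pad_block here are local stand-ins, not vLLM's utilities):

import torch


def cdiv(a: int, b: int) -> int:
    # Ceiling division: number of b-sized blocks needed to cover a.
    return -(a // -b)


def pad_block(x: torch.Tensor, block_m: int = 128,
              block_n: int = 128) -> torch.Tensor:
    m, n = x.shape
    # Allocate zeros with both dims rounded up to block multiples, then copy
    # the original values into the top-left corner.
    x_padded = torch.zeros(
        (cdiv(m, block_m) * block_m, cdiv(n, block_n) * block_n),
        dtype=x.dtype, device=x.device)
    x_padded[:m, :n] = x
    return x_padded


print(pad_block(torch.ones(3, 5), block_m=4, block_n=8).shape)  # torch.Size([4, 8])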

tests/kernels/moe/test_pplx_moe.py
Lines changed: 19 additions & 15 deletions

@@ -18,18 +18,16 @@
 except ImportError:
     has_pplx = False

+from tests.kernels.moe.utils import make_test_weights, naive_batched_moe
 from tests.kernels.utils import torch_experts
-from tests.kernels.moe.utils import (make_test_weights, naive_batched_moe)
 from vllm.config import VllmConfig, set_current_vllm_config
-from vllm.model_executor.layers.fused_moe import (
-    override_config,
-    fused_topk)
-from vllm.model_executor.layers.fused_moe.fused_moe import get_default_config
+from vllm.model_executor.layers.fused_moe import fused_topk, override_config
 from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig
-from vllm.model_executor.layers.fused_moe.modular_kernel import (
-    FusedMoEModularKernel)
 from vllm.model_executor.layers.fused_moe.fused_batched_moe import (
     BatchedPrepareAndFinalize, BatchedTritonExperts, NaiveBatchedExperts)
+from vllm.model_executor.layers.fused_moe.fused_moe import get_default_config
+from vllm.model_executor.layers.fused_moe.modular_kernel import (
+    FusedMoEModularKernel)
 from vllm.platforms import current_platform
 from vllm.utils import round_up

@@ -579,11 +577,14 @@ def _pplx_moe(

     with set_current_vllm_config(vllm_config), override_config(moe_config):
         topk_weight, topk_ids, _ = fused_topk(a, score, topk, False)
-        torch_output = torch_experts(a, w1, w2, topk_weight, topk_ids, w1_s, w2_s,
-                                     qtype, per_act_token_quant, block_shape)
-        pplx_output = pplx_moe(group_name, pgi.rank, pgi.world_size, dp_size, a,
-                               w1, w2, topk_weight, topk_ids, w1_s, w2_s, qtype,
-                               per_act_token_quant, block_shape)
+        torch_output = torch_experts(a, w1, w2, topk_weight, topk_ids,
+                                     w1_scale=w1_s, w2_scale=w2_s,
+                                     quant_dtype=qtype,
+                                     per_act_token_quant=per_act_token_quant,
+                                     block_shape=block_shape)
+        pplx_output = pplx_moe(group_name, pgi.rank, pgi.world_size, dp_size,
+                               a, w1, w2, topk_weight, topk_ids, w1_s, w2_s,
+                               qtype, per_act_token_quant, block_shape)
         # TODO (bnell): fix + re-enable
         #batched_output = _batched_moe(pgi, dp_size, a, w1, w2, topk_weight,
         #                              topk_ids)
@@ -601,7 +602,7 @@ def _pplx_moe(
 @pytest.mark.parametrize("mnk", PPLX_MOE_COMBOS)
 @pytest.mark.parametrize("e", NUM_EXPERTS)
 @pytest.mark.parametrize("topk", TOP_KS)
-@pytest.mark.parametrize("dtype", [torch.bfloat16]) # torch.float8_e4m3fn,
+@pytest.mark.parametrize("dtype", [torch.bfloat16])  # torch.float8_e4m3fn,
 @pytest.mark.parametrize("world_dp_size", [[2, 1]])
 @pytest.mark.parametrize("per_act_token_quant", [False, True])
 @pytest.mark.parametrize("block_shape", [None, [128, 128]])
@@ -634,8 +635,11 @@ def test_pplx_moe(
     a = torch.randn((m, k), device="cuda", dtype=torch.bfloat16) / 10
     score = torch.randn((m, e), device="cuda", dtype=torch.bfloat16)

-    _, w1, w1_s, _, w2, w2_s = make_test_weights(
-        e, n, k, quant_dtype=quant_dtype, block_shape=block_shape)
+    _, w1, w1_s, _, w2, w2_s = make_test_weights(e,
+                                                 n,
+                                                 k,
+                                                 quant_dtype=quant_dtype,
+                                                 block_shape=block_shape)

     parallel_launch(world_size, _pplx_moe, dp_size, a, w1, w2, score, topk,
                     w1_s, w2_s, quant_dtype, per_act_token_quant, block_shape,
