
Commit 96d3d02

lint
Signed-off-by: Bill Nell <bnell@redhat.com>
1 parent 8d17616 commit 96d3d02

6 files changed, +16 -20 lines

tests/kernels/moe/test_batched_moe.py

Lines changed: 2 additions & 2 deletions
@@ -9,8 +9,8 @@
 import triton.language as tl
 
 from tests.kernels.moe.utils import (batched_moe, make_test_weights,
-                                     torch_moe2, triton_moe,
-                                     per_block_cast_to_fp8)
+                                     per_block_cast_to_fp8, torch_moe2,
+                                     triton_moe)
 from tests.kernels.quant_utils import native_w8a8_block_matmul
 from vllm.config import VllmConfig, set_current_vllm_config
 from vllm.model_executor.layers.fused_moe.fused_batched_moe import (

tests/kernels/moe/test_pplx_moe.py

Lines changed: 3 additions & 6 deletions
@@ -23,13 +23,12 @@
                                      torch_moe2)
 from tests.pplx_utils import ProcessGroupInfo, parallel_launch
 from vllm.config import VllmConfig, set_current_vllm_config
-from vllm.model_executor.layers.fused_moe import (override_config,
+from vllm.model_executor.layers.fused_moe import (BatchedTritonExperts,
                                                   FusedMoEConfig,
+                                                  FusedMoEModularKernel,
                                                   fused_topk,
                                                   get_default_config,
-                                                  FusedMoEModularKernel,
-                                                  BatchedTritonExperts,
-                                                  FusedMoEModularKernel)
+                                                  override_config)
 from vllm.model_executor.layers.fused_moe.fused_batched_moe import (
     BatchedPrepareAndFinalize, NaiveBatchedExperts)
 from vllm.platforms import current_platform

@@ -561,8 +560,6 @@ def _pplx_moe(
 
     moe_config = get_default_config(m, e, n, k, topk, a.dtype, False)
 
-    use_fp8_w8a8 = qtype == torch.float8_e4m3fn
-
     device = torch.device("cuda", pgi.rank)
     a = a.to(device)
     w1 = w1.to(device)

tests/kernels/moe/utils.py

Lines changed: 2 additions & 5 deletions
@@ -33,10 +33,7 @@ def Xnative_w8a8_block_matmul(A: torch.Tensor,
     `Bs` (float32).
     The output is returned in the specified `output_dtype`.
     """
-    if A.dtype.itemsize <= 2:
-        compute_type = torch.bfloat16
-    else:
-        compute_type = torch.float32
+    compute_type = torch.bfloat16 if A.dtype.itemsize <= 2 else torch.float32
 
     A = A.to(compute_type)
     B = B.to(compute_type).contiguous()

@@ -101,7 +98,7 @@ def torch_moe2(
     block_shape: Optional[list[int]] = None,
 ) -> torch.Tensor:
     M, K = a.shape
-    N = w1.shape[1]
+    #N = w1.shape[1]
     topk = topk_ids.shape[1]
 
     a = a.view(M, -1, K).repeat(1, topk, 1).reshape(-1, K)
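
As an aside on the one-liner introduced above: a small standalone sketch (illustration only, not part of the commit) of how the `dtype.itemsize` test partitions dtypes; anything two bytes or narrower (fp8, fp16, bf16) is upcast to bfloat16 for the reference matmul, while wider dtypes fall back to float32.

import torch

# Illustration of the compute-type selection used in Xnative_w8a8_block_matmul:
# dtypes of two bytes or fewer are upcast to bfloat16, wider ones to float32.
for dtype in (torch.float8_e4m3fn, torch.float16, torch.bfloat16, torch.float32):
    compute_type = torch.bfloat16 if dtype.itemsize <= 2 else torch.float32
    print(f"{dtype} (itemsize={dtype.itemsize}) -> {compute_type}")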

vllm/model_executor/layers/fused_moe/config.py

Lines changed: 1 addition & 1 deletion
@@ -10,7 +10,7 @@
 
 import vllm.envs as envs
 from vllm.config import ParallelConfig
-from vllm.distributed import get_dp_group
+from vllm.distributed import get_dp_group, get_tensor_model_parallel_rank
 from vllm.model_executor.layers.quantization.base_config import (
     QuantizationConfig)
 

vllm/model_executor/layers/fused_moe/fused_moe.py

Lines changed: 3 additions & 2 deletions
@@ -12,7 +12,8 @@
 import vllm.model_executor.layers.fused_moe.modular_kernel as mk
 from vllm import _custom_ops as ops
 from vllm.logger import init_logger
-from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig
+from vllm.model_executor.layers.fused_moe.config import (
+    FusedMoEQuantConfig, get_config_quant_dtype)
 from vllm.model_executor.layers.fused_moe.deep_gemm_moe import (
     _valid_deep_gemm, deep_gemm_moe_fp8)
 from vllm.model_executor.layers.fused_moe.moe_align_block_size import (

@@ -475,7 +476,7 @@ def prepare_scales(
     from vllm.utils import round_up
     max_num_tokens = round_up(a1.shape[0], 64)
     num_tokens, hidden_dim = a1.size()
-    topk = topk_ids.size(1)
+    #topk = topk_ids.size(1)
 
     tokens_per_expert = torch.zeros(num_experts,
                                     dtype=torch.int,
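
For context on the `tokens_per_expert` buffer allocated above, a minimal sketch of how per-expert token counts can be derived from `topk_ids`; the shapes and the `bincount` approach are illustrative assumptions, not the actual body of `prepare_scales`.

import torch

# Hypothetical example inputs; prepare_scales itself receives these from the
# MoE routing step.
num_experts = 4
topk_ids = torch.tensor([[0, 2], [1, 2], [3, 0]])  # [num_tokens, topk]

# One way to fill a tokens_per_expert-style buffer: count how many routed
# (token, expert) assignments land on each expert.
tokens_per_expert = torch.bincount(topk_ids.flatten(),
                                   minlength=num_experts).to(torch.int)
print(tokens_per_expert)  # tensor([2, 1, 2, 1], dtype=torch.int32)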

vllm/model_executor/layers/fused_moe/layer.py

Lines changed: 5 additions & 4 deletions
@@ -24,18 +24,19 @@
     is_rocm_aiter_moe_enabled)
 from vllm.model_executor.layers.quantization.base_config import (
     QuantizationConfig, QuantizeMethodBase)
+from vllm.model_executor.layers.fused_moe.modular_kernel import (
+    FusedMoEModularKernel,
+    FusedMoEPermuteExpertsUnpermute,
+    FusedMoEPrepareAndFinalize)
 from vllm.model_executor.utils import set_weight_attrs
 from vllm.platforms import current_platform
 from vllm.platforms.interface import CpuArchEnum
 from vllm.utils import direct_register_custom_op
 
+
 has_pplx = importlib.util.find_spec("pplx_kernels") is not None
 has_deepep = importlib.util.find_spec("deep_ep") is not None
 
-from .modular_kernel import (FusedMoEModularKernel,
-                             FusedMoEPermuteExpertsUnpermute,
-                             FusedMoEPrepareAndFinalize)
-
 if current_platform.is_cuda_alike():
     from .fused_batched_moe import BatchedTritonExperts
     from .fused_moe import TritonExperts, fused_experts
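
The `find_spec` checks kept above follow the usual pattern for probing optional dependencies without importing them; a minimal standalone sketch:

import importlib.util

# find_spec() returns None when a package is not installed, so availability
# can be recorded without importing the package (or paying its import cost).
has_pplx = importlib.util.find_spec("pplx_kernels") is not None
has_deepep = importlib.util.find_spec("deep_ep") is not None

print(f"pplx_kernels available: {has_pplx}")
print(f"deep_ep available: {has_deepep}")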
