From 270d05d9fdf9fc68767056204a1fee078358b122 Mon Sep 17 00:00:00 2001
From: Robert Shaw
Date: Sat, 12 Jul 2025 19:52:14 +0000
Subject: [PATCH 1/5] updated

Signed-off-by: Robert Shaw
---
 benchmarks/kernels/benchmark_moe.py           | 12 +++++++++++-
 tools/ep_kernels/install_python_libraries.sh  |  6 +++---
 .../layers/fused_moe/pplx_prepare_finalize.py |  5 ++---
 3 files changed, 16 insertions(+), 7 deletions(-)

diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py
index 51c9f68e43a..972d8a0ba6f 100644
--- a/benchmarks/kernels/benchmark_moe.py
+++ b/benchmarks/kernels/benchmark_moe.py
@@ -595,6 +595,13 @@ def main(args: argparse.Namespace):
         intermediate_size = config.intermediate_size
         shard_intermediate_size = 2 * intermediate_size // args.tp_size
 
+    # Expert parallelism
+    if E % args.ep_size != 0:
+        raise ValueError(
+            f"Number of experts {E} must be divisible by expert parallel size {args.ep_size}"
+        )
+    E = E // args.ep_size
+
     hidden_size = config.hidden_size
     dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype
     use_fp8_w8a8 = args.dtype == "fp8_w8a8"
@@ -724,7 +731,10 @@ def _distribute(method: str, inputs: list[Any]) -> list[Any]:
         "--model", type=str, default="mistralai/Mixtral-8x7B-Instruct-v0.1"
     )
     parser.add_argument(
-        "--tp-size", "-tp", "--tensor-parallel-size", type=int, default=2
+        "--tp-size", "-tp", "--tensor-parallel-size", type=int, default=1
+    )
+    parser.add_argument(
+        "--ep-size", "-ep", "--expert-parallel-size", type=int, default=1
     )
     parser.add_argument(
         "--dtype", type=str, choices=["auto", "fp8_w8a8", "int8_w8a16"], default="auto"
diff --git a/tools/ep_kernels/install_python_libraries.sh b/tools/ep_kernels/install_python_libraries.sh
index 83643c084bf..a1d7f547d54 100644
--- a/tools/ep_kernels/install_python_libraries.sh
+++ b/tools/ep_kernels/install_python_libraries.sh
@@ -11,7 +11,7 @@ if [ ! -d "$WORKSPACE" ]; then
 fi
 
 # install dependencies if not installed
-pip3 install cmake torch ninja
+uv pip install cmake torch ninja
 
 # build nvshmem
 pushd $WORKSPACE
@@ -59,7 +59,7 @@ git clone https://github.com/ppl-ai/pplx-kernels
 cd pplx-kernels
 # see https://github.com/pypa/pip/issues/9955#issuecomment-838065925
 # PIP_NO_BUILD_ISOLATION=0 disables build isolation
-PIP_NO_BUILD_ISOLATION=0 TORCH_CUDA_ARCH_LIST=9.0a+PTX pip install -vvv -e .
+PIP_NO_BUILD_ISOLATION=0 TORCH_CUDA_ARCH_LIST=9.0a+PTX uv pip install -vvv -e .
 popd
 
 # build and install deepep, require pytorch installed
@@ -67,5 +67,5 @@ pushd $WORKSPACE
 git clone https://github.com/deepseek-ai/DeepEP
 cd DeepEP
 export NVSHMEM_DIR=$WORKSPACE/nvshmem_install
-PIP_NO_BUILD_ISOLATION=0 pip install -vvv -e .
+PIP_NO_BUILD_ISOLATION=0 uv pip install -vvv -e .
 popd
diff --git a/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py b/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py
index 4cd68608f02..b5f60193d7e 100644
--- a/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py
+++ b/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py
@@ -197,14 +197,13 @@ def prepare(
         # This argument is optional, defaults to indices.size(0)
         # There's not much point setting this unless it is != indices.size(0)
         bound_m: Optional[torch.Tensor] = None
-
         self.a2a.dispatch(
             out_expert_num_tokens=expert_num_tokens,
             out_expert_x=expert_x,
             out_expert_x_scale=expert_x_scale,
             dp_x=a1q,
             dp_x_scale=a1q_scale,
-            indices=topk_ids,
+            indices=topk_ids.view(dtype=torch.uint32),
             bound_m=bound_m,
         )
 
@@ -249,7 +248,7 @@ def finalize(
             topk_weights = torch.ones_like(topk_weights)
 
         self.a2a.combine(out_tokens=output,
-                         indices=topk_ids,
+                         indices=topk_ids.view(dtype=torch.uint32),
                          weights=topk_weights,
                          expert_y=fused_expert_output,
                          bound_m=bound_m)
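A note on the pplx change above: the pplx-kernels dispatch/combine collectives take unsigned 32-bit expert indices, while the `topk_ids` coming out of vLLM's router are signed 32-bit. `Tensor.view(dtype=...)` reinterprets the existing buffer instead of copying, which is only valid because int32 and uint32 have the same element size (the view would fail on a 64-bit tensor). A minimal standalone sketch of the idea — not vLLM code; the values are made up and a recent PyTorch with `torch.uint32` is assumed:

    import torch

    # Router-style output: signed int32 expert ids, shape [num_tokens, top_k].
    topk_ids = torch.tensor([[0, 3], [7, 1]], dtype=torch.int32)

    # Reinterpret the same storage as uint32: no copy, identical bit
    # patterns, so non-negative ids keep their values.
    indices = topk_ids.view(dtype=torch.uint32)
    assert indices.data_ptr() == topk_ids.data_ptr()  # shares storage

The `--ep-size` handling in benchmark_moe.py follows the same logic as expert parallelism itself: each rank hosts `E // ep_size` experts, so the benchmark tunes kernels for the per-rank expert count, and the divisibility check rejects configurations that cannot be sharded evenly.
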
From 8ce3cad72fbd0dc6524e495ecddbbc58fd8fd09e Mon Sep 17 00:00:00 2001
From: Robert Shaw
Date: Sat, 12 Jul 2025 19:56:56 +0000
Subject: [PATCH 2/5] updated

Signed-off-by: Robert Shaw
---
 Dockerfile | 12 ++++++++++++
 1 file changed, 12 insertions(+)
 create mode 100644 Dockerfile

diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 00000000000..beb001ca2f5
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,12 @@
+ARG CUDA_VERSION=12.8.1
+from nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04
+
+RUN wget -qO- https://astral.sh/uv/install.sh | sh
+
+WORKDIR /workspace
+RUN git clone https://github.com/vllm-project/vllm.git && \
+    VLLM_USE_PRECOMPILED=1 uv pip install -e .
+
+WORKDIR /workspace/vllm
+ENV VLLM_SHA=270d05d9fdf9fc68767056204a1fee078358b122
+RUN git fetch && git checkout VLLM_SHA

From 550f8a052cae03c7e14a46767f689ab09c1cc28d Mon Sep 17 00:00:00 2001
From: Robert Shaw
Date: Sat, 12 Jul 2025 19:57:11 +0000
Subject: [PATCH 3/5] updated

Signed-off-by: Robert Shaw
---
 Dockerfile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index beb001ca2f5..d66dbba91d3 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -8,5 +8,5 @@ RUN git clone https://github.com/vllm-project/vllm.git && \
     VLLM_USE_PRECOMPILED=1 uv pip install -e .
 
 WORKDIR /workspace/vllm
-ENV VLLM_SHA=270d05d9fdf9fc68767056204a1fee078358b122
-RUN git fetch && git checkout VLLM_SHA
+ENV VLLM_SHA=8ce3cad72fbd0dc6524e495ecddbbc58fd8fd09e
+RUN git fetch && git checkout ${VLLM_SHA}

From 13729ad0afa55792aa94ad3becca08b486d292f9 Mon Sep 17 00:00:00 2001
From: Robert Shaw
Date: Sat, 12 Jul 2025 21:57:16 +0000
Subject: [PATCH 4/5] updated

Signed-off-by: Robert Shaw
---
 Dockerfile                  | 18 +++++++++----
 benchmarks/kernels/Justfile | 53 +++++++++++++++++++++++++++++++++++++
 2 files changed, 66 insertions(+), 5 deletions(-)
 create mode 100644 benchmarks/kernels/Justfile

diff --git a/Dockerfile b/Dockerfile
index d66dbba91d3..692d5059b7f 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,12 +1,20 @@
 ARG CUDA_VERSION=12.8.1
-from nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04
+FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04
 
-RUN wget -qO- https://astral.sh/uv/install.sh | sh
+RUN apt update && apt install git -y && apt install curl -y
 
 WORKDIR /workspace
-RUN git clone https://github.com/vllm-project/vllm.git && \
-    VLLM_USE_PRECOMPILED=1 uv pip install -e .
+RUN git clone https://github.com/vllm-project/vllm.git
+COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/
 
+# Install vllm.
 WORKDIR /workspace/vllm
-ENV VLLM_SHA=8ce3cad72fbd0dc6524e495ecddbbc58fd8fd09e
+RUN uv venv .vllm --python 3.12
+RUN . .vllm/bin/activate && VLLM_USE_PRECOMPILED=1 uv pip install -e .
+
+# Checkout a specific commit.
+ENV VLLM_SHA=550f8a052cae03c7e14a46767f689ab09c1cc28d
 RUN git fetch && git checkout ${VLLM_SHA}
+
+ENTRYPOINT ["/bin/bash"]
diff --git a/benchmarks/kernels/Justfile b/benchmarks/kernels/Justfile
new file mode 100644
index 00000000000..6b2a8c67357
--- /dev/null
+++ b/benchmarks/kernels/Justfile
@@ -0,0 +1,53 @@
+llama-scout-bf16:
+    python3 benchmark_moe.py \
+        --model meta-llama/Llama-4-Scout-17B-16E-Instruct \
+        --tp-size 1 \
+        --ep-size 8 \
+        --tune
+
+llama-scout-fp8:
+    python3 benchmark_moe.py \
+        --model meta-llama/Llama-4-Scout-17B-16E-Instruct \
+        --tp-size 1 \
+        --ep-size 8 \
+        --dtype fp8_w8a8 \
+        --tune
+
+llama-maverick:
+    python3 benchmark_moe.py \
+        --model meta-llama/Llama-4-Maverick-17B-128E-Instruct \
+        --tp-size 1 \
+        --ep-size 8 \
+        --dtype fp8_w8a8 \
+        --tune
+
+qwen-30b:
+    python3 benchmark_moe.py \
+        --model Qwen/Qwen3-30B-A3B \
+        --tp-size 1 \
+        --ep-size 8 \
+        --tune
+
+qwen-30b-fp8:
+    python3 benchmark_moe.py \
+        --model Qwen/Qwen3-30B-A3B-FP8 \
+        --tp-size 1 \
+        --ep-size 8 \
+        --dtype fp8_w8a8 \
+        --tune
+
+qwen-235b:
+    python3 benchmark_moe.py \
+        --model Qwen/Qwen3-235B-A22B \
+        --tp-size 1 \
+        --ep-size 8 \
+        --dtype fp8_w8a8 \
+        --tune
+
+deepseek-r1:
+    python3 benchmark_moe.py \
+        --model deepseek-ai/DeepSeek-R1-0528 \
+        --tp-size 1 \
+        --ep-size 8 \
+        --dtype fp8_w8a8 \
+        --tune
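For reference, one (untested) way to build and enter the image defined above; the `vllm-moe-bench` tag is made up for the example:

    docker build -t vllm-moe-bench .
    docker run --gpus all -it vllm-moe-bench
    # inside the container, activate the venv the Dockerfile created:
    . .vllm/bin/activate

The activation step matters because this Dockerfile installs vllm into the `.vllm` virtualenv rather than into the system interpreter.
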
From 94e7c6dac7dc0765622f809005449e1faba62574 Mon Sep 17 00:00:00 2001
From: Robert Shaw
Date: Sat, 12 Jul 2025 22:38:42 +0000
Subject: [PATCH 5/5] updated

Signed-off-by: Robert Shaw
---
 benchmarks/kernels/Justfile | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/benchmarks/kernels/Justfile b/benchmarks/kernels/Justfile
index 6b2a8c67357..9e32aa8abc5 100644
--- a/benchmarks/kernels/Justfile
+++ b/benchmarks/kernels/Justfile
@@ -1,3 +1,13 @@
+all:
+    just llama-scout-bf16 && \
+    just llama-scout-fp8 && \
+    just llama-maverick && \
+    just qwen-30b && \
+    just qwen-30b-fp8 && \
+    just qwen-235b && \
+    just deepseek-r1
+
+
 llama-scout-bf16:
     python3 benchmark_moe.py \
         --model meta-llama/Llama-4-Scout-17B-16E-Instruct \
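Usage sketch for the Justfile, assuming `just` is installed and the GPUs the recipes expect are visible:

    cd benchmarks/kernels
    just llama-scout-bf16   # tune a single model
    just all                # run the full sweep

Since `all` is now the first recipe in the file, a bare `just` runs it as the default. Because the recipe chains its sub-recipes with `&&`, the sweep stops at the first failing model rather than continuing.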