From 270d05d9fdf9fc68767056204a1fee078358b122 Mon Sep 17 00:00:00 2001
From: Robert Shaw
Date: Sat, 12 Jul 2025 19:52:14 +0000
Subject: [PATCH 1/5] updated

Signed-off-by: Robert Shaw
---
 benchmarks/kernels/benchmark_moe.py           | 12 +++++++++++-
 tools/ep_kernels/install_python_libraries.sh  |  6 +++---
 .../layers/fused_moe/pplx_prepare_finalize.py |  5 ++---
 3 files changed, 16 insertions(+), 7 deletions(-)

diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py
index 51c9f68e43a..972d8a0ba6f 100644
--- a/benchmarks/kernels/benchmark_moe.py
+++ b/benchmarks/kernels/benchmark_moe.py
@@ -595,6 +595,13 @@ def main(args: argparse.Namespace):
         intermediate_size = config.intermediate_size
         shard_intermediate_size = 2 * intermediate_size // args.tp_size
 
+    # Expert parallelism
+    if E % args.ep_size != 0:
+        raise ValueError(
+            f"Number of experts {E} must be divisible by expert parallel size {args.ep_size}"
+        )
+    E = E // args.ep_size
+
     hidden_size = config.hidden_size
     dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype
     use_fp8_w8a8 = args.dtype == "fp8_w8a8"
@@ -724,7 +731,10 @@ def _distribute(method: str, inputs: list[Any]) -> list[Any]:
         "--model", type=str, default="mistralai/Mixtral-8x7B-Instruct-v0.1"
     )
     parser.add_argument(
-        "--tp-size", "-tp", "--tensor-parallel-size", type=int, default=2
+        "--tp-size", "-tp", "--tensor-parallel-size", type=int, default=1
+    )
+    parser.add_argument(
+        "--ep-size", "-ep", "--expert-parallel-size", type=int, default=1
     )
     parser.add_argument(
         "--dtype", type=str, choices=["auto", "fp8_w8a8", "int8_w8a16"], default="auto"
diff --git a/tools/ep_kernels/install_python_libraries.sh b/tools/ep_kernels/install_python_libraries.sh
index 83643c084bf..a1d7f547d54 100644
--- a/tools/ep_kernels/install_python_libraries.sh
+++ b/tools/ep_kernels/install_python_libraries.sh
@@ -11,7 +11,7 @@ if [ ! -d "$WORKSPACE" ]; then
 fi
 
 # install dependencies if not installed
-pip3 install cmake torch ninja
+uv pip install cmake torch ninja
 
 # build nvshmem
 pushd $WORKSPACE
@@ -59,7 +59,7 @@ git clone https://github.com/ppl-ai/pplx-kernels
 cd pplx-kernels
 # see https://github.com/pypa/pip/issues/9955#issuecomment-838065925
 # PIP_NO_BUILD_ISOLATION=0 disables build isolation
-PIP_NO_BUILD_ISOLATION=0 TORCH_CUDA_ARCH_LIST=9.0a+PTX pip install -vvv -e .
+PIP_NO_BUILD_ISOLATION=0 TORCH_CUDA_ARCH_LIST=9.0a+PTX uv pip install -vvv -e .
 popd
 
 # build and install deepep, require pytorch installed
@@ -67,5 +67,5 @@ pushd $WORKSPACE
 git clone https://github.com/deepseek-ai/DeepEP
 cd DeepEP
 export NVSHMEM_DIR=$WORKSPACE/nvshmem_install
-PIP_NO_BUILD_ISOLATION=0 pip install -vvv -e .
+PIP_NO_BUILD_ISOLATION=0 uv pip install -vvv -e .
 popd
diff --git a/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py b/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py
index 4cd68608f02..b5f60193d7e 100644
--- a/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py
+++ b/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py
@@ -197,14 +197,13 @@ def prepare(
         # This argument is optional, defaults to indices.size(0)
         # There's not much point setting this unless it is != indices.size(0)
         bound_m: Optional[torch.Tensor] = None
-
         self.a2a.dispatch(
             out_expert_num_tokens=expert_num_tokens,
             out_expert_x=expert_x,
             out_expert_x_scale=expert_x_scale,
             dp_x=a1q,
             dp_x_scale=a1q_scale,
-            indices=topk_ids,
+            indices=topk_ids.view(dtype=torch.uint32),
             bound_m=bound_m,
         )
 
@@ -249,7 +248,7 @@ def finalize(
             topk_weights = torch.ones_like(topk_weights)
 
         self.a2a.combine(out_tokens=output,
-                         indices=topk_ids,
+                         indices=topk_ids.view(dtype=torch.uint32),
                          weights=topk_weights,
                          expert_y=fused_expert_output,
                          bound_m=bound_m)
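A note on the pplx change above: the pplx-kernels dispatch/combine collectives take unsigned 32-bit expert indices, while the `topk_ids` coming out of vLLM's router are signed 32-bit. `Tensor.view(dtype=...)` reinterprets the existing buffer instead of copying, which is only valid because int32 and uint32 have the same element size (the view would fail on a 64-bit tensor). A minimal standalone sketch of the idea — not vLLM code; the values are made up and a recent PyTorch with `torch.uint32` is assumed:

    import torch

    # Router-style output: signed int32 expert ids, shape [num_tokens, top_k].
    topk_ids = torch.tensor([[0, 3], [7, 1]], dtype=torch.int32)

    # Reinterpret the same storage as uint32: no copy, identical bit
    # patterns, so non-negative ids keep their values.
    indices = topk_ids.view(dtype=torch.uint32)
    assert indices.data_ptr() == topk_ids.data_ptr()  # shares storage

The `--ep-size` handling in benchmark_moe.py follows the same logic as expert parallelism itself: each rank hosts `E // ep_size` experts, so the benchmark tunes kernels for the per-rank expert count, and the divisibility check rejects configurations that cannot be sharded evenly.
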
From 8ce3cad72fbd0dc6524e495ecddbbc58fd8fd09e Mon Sep 17 00:00:00 2001
From: Robert Shaw
Date: Sat, 12 Jul 2025 19:56:56 +0000
Subject: [PATCH 2/5] updated

Signed-off-by: Robert Shaw
---
 Dockerfile | 12 ++++++++++++
 1 file changed, 12 insertions(+)
 create mode 100644 Dockerfile

diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 00000000000..beb001ca2f5
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,12 @@
+ARG CUDA_VERSION=12.8.1
+from nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04
+
+RUN wget -qO- https://astral.sh/uv/install.sh | sh
+
+WORKDIR /workspace
+RUN git clone https://github.com/vllm-project/vllm.git && \
+    VLLM_USE_PRECOMPILED=1 uv pip install -e .
+
+WORKDIR /workspace/vllm
+ENV VLLM_SHA=270d05d9fdf9fc68767056204a1fee078358b122
+RUN git fetch && git checkout VLLM_SHA

From 550f8a052cae03c7e14a46767f689ab09c1cc28d Mon Sep 17 00:00:00 2001
From: Robert Shaw
Date: Sat, 12 Jul 2025 19:57:11 +0000
Subject: [PATCH 3/5] updated

Signed-off-by: Robert Shaw
---
 Dockerfile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index beb001ca2f5..d66dbba91d3 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -8,5 +8,5 @@ RUN git clone https://github.com/vllm-project/vllm.git && \
     VLLM_USE_PRECOMPILED=1 uv pip install -e .
 
 WORKDIR /workspace/vllm
-ENV VLLM_SHA=270d05d9fdf9fc68767056204a1fee078358b122
-RUN git fetch && git checkout VLLM_SHA
+ENV VLLM_SHA=8ce3cad72fbd0dc6524e495ecddbbc58fd8fd09e
+RUN git fetch && git checkout ${VLLM_SHA}

From 13729ad0afa55792aa94ad3becca08b486d292f9 Mon Sep 17 00:00:00 2001
From: Robert Shaw
Date: Sat, 12 Jul 2025 21:57:16 +0000
Subject: [PATCH 4/5] updated

Signed-off-by: Robert Shaw
---
 Dockerfile                  | 18 +++++++++----
 benchmarks/kernels/Justfile | 53 +++++++++++++++++++++++++++++++++++++
 2 files changed, 66 insertions(+), 5 deletions(-)
 create mode 100644 benchmarks/kernels/Justfile

diff --git a/Dockerfile b/Dockerfile
index d66dbba91d3..692d5059b7f 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,12 +1,20 @@
 ARG CUDA_VERSION=12.8.1
-from nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04
+FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04
 
-RUN wget -qO- https://astral.sh/uv/install.sh | sh
+RUN apt update && apt install git -y && apt install curl -y
 
 WORKDIR /workspace
-RUN git clone https://github.com/vllm-project/vllm.git && \
-    VLLM_USE_PRECOMPILED=1 uv pip install -e .
+RUN git clone https://github.com/vllm-project/vllm.git
+COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/
 
+# Install vllm.
 WORKDIR /workspace/vllm
-ENV VLLM_SHA=8ce3cad72fbd0dc6524e495ecddbbc58fd8fd09e
+RUN uv venv .vllm --python 3.12
+RUN . .vllm/bin/activate && VLLM_USE_PRECOMPILED=1 uv pip install -e .
+
+# Checkout a specific commit.
+ENV VLLM_SHA=550f8a052cae03c7e14a46767f689ab09c1cc28d
 RUN git fetch && git checkout ${VLLM_SHA}
+
+ENTRYPOINT ["/bin/bash"]
diff --git a/benchmarks/kernels/Justfile b/benchmarks/kernels/Justfile
new file mode 100644
index 00000000000..6b2a8c67357
--- /dev/null
+++ b/benchmarks/kernels/Justfile
@@ -0,0 +1,53 @@
+llama-scout-bf16:
+    python3 benchmark_moe.py \
+        --model meta-llama/Llama-4-Scout-17B-16E-Instruct \
+        --tp-size 1 \
+        --ep-size 8 \
+        --tune
+
+llama-scout-fp8:
+    python3 benchmark_moe.py \
+        --model meta-llama/Llama-4-Scout-17B-16E-Instruct \
+        --tp-size 1 \
+        --ep-size 8 \
+        --dtype fp8_w8a8 \
+        --tune
+
+llama-maverick:
+    python3 benchmark_moe.py \
+        --model meta-llama/Llama-4-Maverick-17B-128E-Instruct \
+        --tp-size 1 \
+        --ep-size 8 \
+        --dtype fp8_w8a8 \
+        --tune
+
+qwen-30b:
+    python3 benchmark_moe.py \
+        --model Qwen/Qwen3-30B-A3B \
+        --tp-size 1 \
+        --ep-size 8 \
+        --tune
+
+qwen-30b-fp8:
+    python3 benchmark_moe.py \
+        --model Qwen/Qwen3-30B-A3B-FP8 \
+        --tp-size 1 \
+        --ep-size 8 \
+        --dtype fp8_w8a8 \
+        --tune
+
+qwen-235b:
+    python3 benchmark_moe.py \
+        --model Qwen/Qwen3-235B-A22B \
+        --tp-size 1 \
+        --ep-size 8 \
+        --dtype fp8_w8a8 \
+        --tune
+
+deepseek-r1:
+    python3 benchmark_moe.py \
+        --model deepseek-ai/DeepSeek-R1-0528 \
+        --tp-size 1 \
+        --ep-size 8 \
+        --dtype fp8_w8a8 \
+        --tune
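For reference, one (untested) way to build and enter the image defined above; the `vllm-moe-bench` tag is made up for the example:

    docker build -t vllm-moe-bench .
    docker run --gpus all -it vllm-moe-bench
    # inside the container, activate the venv the Dockerfile created:
    . .vllm/bin/activate

The activation step matters because this Dockerfile installs vllm into the `.vllm` virtualenv rather than into the system interpreter.
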
From 94e7c6dac7dc0765622f809005449e1faba62574 Mon Sep 17 00:00:00 2001
From: Robert Shaw
Date: Sat, 12 Jul 2025 22:38:42 +0000
Subject: [PATCH 5/5] updated

Signed-off-by: Robert Shaw
---
 benchmarks/kernels/Justfile | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/benchmarks/kernels/Justfile b/benchmarks/kernels/Justfile
index 6b2a8c67357..9e32aa8abc5 100644
--- a/benchmarks/kernels/Justfile
+++ b/benchmarks/kernels/Justfile
@@ -1,3 +1,13 @@
+all:
+    just llama-scout-bf16 && \
+    just llama-scout-fp8 && \
+    just llama-maverick && \
+    just qwen-30b && \
+    just qwen-30b-fp8 && \
+    just qwen-235b && \
+    just deepseek-r1
+
+
 llama-scout-bf16:
     python3 benchmark_moe.py \
         --model meta-llama/Llama-4-Scout-17B-16E-Instruct \
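Usage sketch for the Justfile, assuming `just` is installed and the GPUs the recipes expect are visible:

    cd benchmarks/kernels
    just llama-scout-bf16   # tune a single model
    just all                # run the full sweep

Since `all` is now the first recipe in the file, a bare `just` runs it as the default. Because the recipe chains its sub-recipes with `&&`, the sweep stops at the first failing model rather than continuing.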