Skip to content

Commit 55bf4bc

Browse files
committed
Merge remote-tracking branch 'upstream/main' into eagle-kernel-fusion
2 parents c67eb72 + e1f5a71 commit 55bf4bc

File tree

107 files changed

+2751
-873
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

107 files changed

+2751
-873
lines changed

.buildkite/release-pipeline.yaml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ steps:
1414
agents:
1515
queue: cpu_queue_postmerge
1616
commands:
17-
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.6.3 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
17+
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.6.3 --build-arg torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
1818
- "mkdir artifacts"
1919
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
2020
- "bash .buildkite/scripts/upload-wheels.sh"
@@ -31,7 +31,7 @@ steps:
3131
agents:
3232
queue: cpu_queue_postmerge
3333
commands:
34-
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=11.8.0 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
34+
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=11.8.0 --build-arg torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
3535
- "mkdir artifacts"
3636
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
3737
- "bash .buildkite/scripts/upload-wheels.sh"
@@ -64,7 +64,7 @@ steps:
6464
- "docker push vllm/vllm-tpu:$BUILDKITE_COMMIT"
6565
plugins:
6666
- docker-login#v3.0.0:
67-
username: vllm
67+
username: vllmbot
6868
password-env: DOCKERHUB_TOKEN
6969
env:
7070
DOCKER_BUILDKIT: "1"

.buildkite/scripts/hardware_ci/run-neuron-test.sh

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,13 +11,14 @@ container_name="neuron_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
1111
HF_CACHE="$(realpath ~)/huggingface"
1212
mkdir -p "${HF_CACHE}"
1313
HF_MOUNT="/root/.cache/huggingface"
14+
HF_TOKEN=$(aws secretsmanager get-secret-value --secret-id "ci/vllm-neuron/hf-token" --region us-west-2 --query 'SecretString' --output text | jq -r .VLLM_NEURON_CI_HF_TOKEN)
1415

1516
NEURON_COMPILE_CACHE_URL="$(realpath ~)/neuron_compile_cache"
1617
mkdir -p "${NEURON_COMPILE_CACHE_URL}"
1718
NEURON_COMPILE_CACHE_MOUNT="/root/.cache/neuron_compile_cache"
1819

1920
# Try building the docker image
20-
aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin 763104351884.dkr.ecr.us-west-2.amazonaws.com
21+
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws
2122

2223
# prune old image and containers to save disk space, and only once a day
2324
# by using a timestamp file in tmp.
@@ -47,6 +48,7 @@ trap remove_docker_container EXIT
4748
docker run --rm -it --device=/dev/neuron0 --network bridge \
4849
-v "${HF_CACHE}:${HF_MOUNT}" \
4950
-e "HF_HOME=${HF_MOUNT}" \
51+
-e "HF_TOKEN=${HF_TOKEN}" \
5052
-v "${NEURON_COMPILE_CACHE_URL}:${NEURON_COMPILE_CACHE_MOUNT}" \
5153
-e "NEURON_COMPILE_CACHE_URL=${NEURON_COMPILE_CACHE_MOUNT}" \
5254
--name "${container_name}" \

.github/CODEOWNERS

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
/vllm/model_executor/guided_decoding @mgoin @russellb
1414
/vllm/multimodal @DarkLight1337 @ywang96
1515
/vllm/vllm_flash_attn @LucasWilkinson
16+
/vllm/lora @jeejeelee
1617
CMakeLists.txt @tlrmchlsmth
1718

1819
# vLLM V1
@@ -40,3 +41,4 @@ CMakeLists.txt @tlrmchlsmth
4041
/tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb
4142
/tests/v1/structured_output @mgoin @russellb
4243
/tests/weight_loading @mgoin @youkaichao
44+
/tests/lora @jeejeelee

.github/mergify.yml

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -163,6 +163,17 @@ pull_request_rules:
163163
164164
https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/working-with-forks/syncing-a-fork
165165
166+
- name: assign reviewer for tensorizer changes
167+
conditions:
168+
- files~=^vllm/model_executor/model_loader/tensorizer.py
169+
- files~=^vllm/model_executor/model_loader/tensorizer_loader.py
170+
- files~=^tests/entrypoints/openai/test_tensorizer_entrypoint.py
171+
- files~=^tests/tensorizer_loader/
172+
actions:
173+
assign:
174+
users:
175+
- "sangstar"
176+
166177
- name: remove 'needs-rebase' label when conflict is resolved
167178
conditions:
168179
- -conflict

csrc/cutlass_extensions/common.hpp

Lines changed: 0 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -15,15 +15,6 @@
1515
cutlassGetStatusString(error)); \
1616
}
1717

18-
/**
19-
* Panic wrapper for unwinding CUDA runtime errors
20-
*/
21-
#define CUDA_CHECK(status) \
22-
{ \
23-
cudaError_t error = status; \
24-
TORCH_CHECK(error == cudaSuccess, cudaGetErrorString(error)); \
25-
}
26-
2718
inline int get_cuda_max_shared_memory_per_block_opt_in(int const device) {
2819
int max_shared_mem_per_block_opt_in = 0;
2920
cudaDeviceGetAttribute(&max_shared_mem_per_block_opt_in,

csrc/moe/marlin_moe_wna16/marlin_template.h

Lines changed: 14 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1767,17 +1767,20 @@ __global__ void Marlin(
17671767

17681768
if constexpr (has_act_order) {
17691769
slice_k_start += tb_k * stages;
1770-
slice_k_start_shared_fetch += tb_k * stages;
1771-
int first_group_id = g_idx[slice_k_start];
1772-
int last_g_idx = slice_k_start + stages * tb_k * 2;
1773-
if (last_g_idx >= prob_k) {
1774-
last_g_idx = prob_k - 1;
1775-
}
1776-
int last_group_id = g_idx[last_g_idx];
1777-
if (last_group_id >= sh_first_group_id + sh_num_groups) {
1778-
fetch_act_order_scales_to_shared(false, first_group_id,
1779-
last_group_id);
1780-
__syncthreads();
1770+
1771+
if (slice_k_start < prob_k) {
1772+
slice_k_start_shared_fetch += tb_k * stages;
1773+
int first_group_id = g_idx[slice_k_start];
1774+
int last_g_idx = slice_k_start + stages * tb_k * 2;
1775+
if (last_g_idx >= prob_k) {
1776+
last_g_idx = prob_k - 1;
1777+
}
1778+
int last_group_id = g_idx[last_g_idx];
1779+
if (last_group_id >= sh_first_group_id + sh_num_groups) {
1780+
fetch_act_order_scales_to_shared(false, first_group_id,
1781+
last_group_id);
1782+
__syncthreads();
1783+
}
17811784
}
17821785
}
17831786
if (slice_iters == 0) {

csrc/quantization/gptq_marlin/marlin_template.h

Lines changed: 14 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1588,16 +1588,20 @@ __global__ void Marlin(
15881588

15891589
if constexpr (has_act_order) {
15901590
slice_k_start += tb_k * stages;
1591-
slice_k_start_shared_fetch += tb_k * stages;
1592-
int first_group_id = g_idx[slice_k_start];
1593-
int last_g_idx = slice_k_start + stages * tb_k * 2;
1594-
if (last_g_idx >= prob_k) {
1595-
last_g_idx = prob_k - 1;
1596-
}
1597-
int last_group_id = g_idx[last_g_idx];
1598-
if (last_group_id >= sh_first_group_id + sh_num_groups) {
1599-
fetch_act_order_scales_to_shared(false, first_group_id, last_group_id);
1600-
__syncthreads();
1591+
1592+
if (slice_k_start < prob_k) {
1593+
slice_k_start_shared_fetch += tb_k * stages;
1594+
int first_group_id = g_idx[slice_k_start];
1595+
int last_g_idx = slice_k_start + stages * tb_k * 2;
1596+
if (last_g_idx >= prob_k) {
1597+
last_g_idx = prob_k - 1;
1598+
}
1599+
int last_group_id = g_idx[last_g_idx];
1600+
if (last_group_id >= sh_first_group_id + sh_num_groups) {
1601+
fetch_act_order_scales_to_shared(false, first_group_id,
1602+
last_group_id);
1603+
__syncthreads();
1604+
}
16011605
}
16021606
}
16031607

csrc/sparse/cutlass/sparse_scaled_mm_c3x.cuh

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@
88

99
#include <ATen/cuda/CUDAContext.h>
1010

11+
#include "cuda_utils.h"
12+
1113
#include "cutlass/cutlass.h"
1214

1315
#include "cutlass/gemm/device/gemm_universal_adapter.h"
@@ -95,9 +97,9 @@ struct cutlass_sparse_3x_gemm {
9597
// clang-format off
9698
using CollectiveMainloop =
9799
typename cutlass::gemm::collective::CollectiveBuilder<
98-
cutlass::arch::Sm90, cutlass::arch::OpClassSparseTensorOp,
99-
ElementAB, cutlass::layout::RowMajor, AlignmentAB,
100-
ElementAB, cutlass::layout::ColumnMajor, AlignmentAB,
100+
cutlass::arch::Sm90, cutlass::arch::OpClassSparseTensorOp,
101+
ElementAB, cutlass::layout::RowMajor, AlignmentAB,
102+
ElementAB, cutlass::layout::ColumnMajor, AlignmentAB,
101103
ElementAcc, TileShape, ClusterShape,
102104
Stages,
103105
KernelSchedule>::CollectiveOp;

docker/Dockerfile

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
7777
# can be useful for both `dev` and `test`
7878
# explicitly set the list to avoid issues with torch 2.2
7979
# see https://github.com/pytorch/pytorch/pull/123243
80-
ARG torch_cuda_arch_list='7.0 7.5 8.0 8.6 8.9 9.0+PTX'
80+
ARG torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0 10.0+PTX'
8181
ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
8282
# Override the arch list for flash-attn to reduce the binary size
8383
ARG vllm_fa_cmake_gpu_arches='80-real;90-real'
@@ -255,9 +255,15 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist
255255
RUN --mount=type=cache,target=/root/.cache/uv \
256256
. /etc/environment && \
257257
if [ "$TARGETPLATFORM" != "linux/arm64" ]; then \
258+
# uv pip install --system https://github.com/flashinfer-ai/flashinfer/releases/download/v0.2.4/flashinfer_python-0.2.4+cu124torch2.6-cp38-abi3-linux_x86_64.whl ; \
258259
# TESTING: install FlashInfer from source to test 2.7.0 final RC
259-
FLASHINFER_ENABLE_AOT=1 TORCH_CUDA_ARCH_LIST='7.5 8.0 8.6 8.9 9.0+PTX' \
260-
uv pip install --system --no-build-isolation "git+https://github.com/flashinfer-ai/flashinfer@v0.2.2.post1" ; \
260+
if [[ "$CUDA_VERSION" == 12.8* ]]; then \
261+
export TORCH_CUDA_ARCH_LIST='7.5 8.0 8.9 9.0 10.0+PTX'; \
262+
else \
263+
export TORCH_CUDA_ARCH_LIST='7.5 8.0 8.9 9.0+PTX'; \
264+
fi && \
265+
export FLASHINFER_ENABLE_AOT=1; \
266+
uv pip install --system --no-build-isolation "git+https://github.com/flashinfer-ai/flashinfer@21ea1d2545f74782b91eb8c08fd503ac4c0743fc" ; \
261267
fi
262268
COPY examples examples
263269
COPY benchmarks benchmarks

docker/Dockerfile.ppc64le

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -21,12 +21,8 @@ ENV UV_LINK_MODE=copy
2121
# Note: A dummy file 'control' is created in /tmp/ to artificially create dependencies between stages when building stages in parallel
2222
# when `--jobs=<N>` is passed with podman build command
2323
RUN microdnf install -y openssl-devel dnf \
24-
&& dnf install -y https://mirror.stream.centos.org/9-stream/BaseOS/`arch`/os/Packages/centos-gpg-keys-9.0-24.el9.noarch.rpm \
25-
https://mirror.stream.centos.org/9-stream/BaseOS/`arch`/os/Packages/centos-stream-repos-9.0-24.el9.noarch.rpm \
26-
https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm \
27-
&& dnf config-manager --add-repo https://mirror.stream.centos.org/9-stream/BaseOS/`arch`/os \
28-
&& dnf config-manager --add-repo https://mirror.stream.centos.org/9-stream/AppStream/`arch`/os \
29-
&& dnf config-manager --set-enabled crb \
24+
&& dnf install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm \
25+
&& dnf config-manager --set-enabled codeready-builder-for-rhel-9-ppc64le-rpms \
3026
&& dnf install -y \
3127
git tar gcc-toolset-13 automake libtool numactl-devel lapack-devel \
3228
pkgconfig xsimd zeromq-devel kmod findutils protobuf* \

0 commit comments

Comments
 (0)