
Commit 97aa052 (merge, 2 parents: 90397d5 + 85bd659)

rebase.

Signed-off-by: Elfie Guo <elfieg@nvidia.com>

File tree: 349 files changed (+21586, -4502 lines)


.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh

Lines changed: 1 addition & 1 deletion
@@ -46,6 +46,6 @@ while getopts "m:b:l:f:t:" OPT; do
 done

 lm_eval --model vllm \
-  --model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,distributed_executor_backend=ray,trust_remote_code=true,max_model_len=4096" \
+  --model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,add_bos_token=true,trust_remote_code=true,max_model_len=4096" \
   --tasks gsm8k --num_fewshot "$FEWSHOT" --limit "$LIMIT" \
   --batch_size "$BATCH_SIZE"
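The change above swaps distributed_executor_backend=ray for add_bos_token=true in the baseline's --model_args. For reference, a rough Python equivalent of the updated invocation, using the same lm_eval.simple_evaluate API that test_lm_eval_correctness.py (next file) relies on, could look like the sketch below; the MODEL/TP_SIZE/FEWSHOT/LIMIT/BATCH_SIZE values are illustrative placeholders for the script's CLI options, not values taken from this commit.

# Hedged sketch: a Python equivalent of the updated shell invocation above.
# The values below stand in for the script's -m/-t/-f/-l/-b options and are
# illustrative placeholders only.
import lm_eval

MODEL = "facebook/opt-125m"   # placeholder for $MODEL
TP_SIZE = 1                   # placeholder for $TP_SIZE
FEWSHOT = 5                   # placeholder for $FEWSHOT
LIMIT = 250                   # placeholder for $LIMIT
BATCH_SIZE = "auto"           # placeholder for $BATCH_SIZE

results = lm_eval.simple_evaluate(
    model="vllm",
    model_args=(
        f"pretrained={MODEL},tensor_parallel_size={TP_SIZE},"
        "add_bos_token=true,trust_remote_code=true,max_model_len=4096"
    ),
    tasks=["gsm8k"],
    num_fewshot=FEWSHOT,
    limit=LIMIT,
    batch_size=BATCH_SIZE,
)
print(results["results"]["gsm8k"])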

.buildkite/lm-eval-harness/test_lm_eval_correctness.py

Lines changed: 3 additions & 1 deletion
@@ -18,12 +18,14 @@

 def launch_lm_eval(eval_config, tp_size):
     trust_remote_code = eval_config.get("trust_remote_code", False)
+    max_model_len = eval_config.get("max_model_len", 4096)
     model_args = (
         f"pretrained={eval_config['model_name']},"
         f"tensor_parallel_size={tp_size},"
         f"enforce_eager=true,"
         f"add_bos_token=true,"
-        f"trust_remote_code={trust_remote_code}"
+        f"trust_remote_code={trust_remote_code},"
+        f"max_model_len={max_model_len}"
     )
     results = lm_eval.simple_evaluate(
         model="vllm",
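The diff adds an optional max_model_len key (default 4096) to the eval config and appends it to the vLLM model_args string. A minimal, self-contained sketch of that behavior, using a hypothetical build_model_args helper rather than the harness's own function, would be:

# Hedged sketch of how the new optional "max_model_len" config key flows into
# the vLLM model_args string. build_model_args is a local illustration helper,
# not a function from the harness.
def build_model_args(eval_config: dict, tp_size: int) -> str:
    trust_remote_code = eval_config.get("trust_remote_code", False)
    max_model_len = eval_config.get("max_model_len", 4096)  # new key, default 4096
    return (
        f"pretrained={eval_config['model_name']},"
        f"tensor_parallel_size={tp_size},"
        f"enforce_eager=true,"
        f"add_bos_token=true,"
        f"trust_remote_code={trust_remote_code},"
        f"max_model_len={max_model_len}"
    )

# Configs that omit the key keep the previous effective limit of 4096; configs
# can now lower (or raise) it explicitly per model.
print(build_model_args({"model_name": "facebook/opt-125m"}, tp_size=1))
print(build_model_args({"model_name": "facebook/opt-125m", "max_model_len": 2048}, tp_size=1))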

.buildkite/scripts/hardware_ci/run-xpu-test.sh

Lines changed: 2 additions & 0 deletions
@@ -27,6 +27,8 @@ docker run \
   "${image_name}" \
   sh -c '
     VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
+    VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
+    VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
     cd tests
     pytest -v -s v1/core
   '
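The two added commands rerun the basic generate example with tensor parallelism 2 under both the Ray and multiprocessing ("mp") executor backends. A hedged Python sketch of roughly the same exercise through the offline LLM API follows; the keyword-argument names mirror current vLLM engine arguments and a 2-device XPU setup is assumed, neither of which comes from this diff.

# Hedged sketch: roughly what the two new CI commands exercise, expressed through
# the offline LLM API instead of examples/offline_inference/basic/generate.py.
# Assumes two XPU devices and that these engine arguments are accepted as shown;
# the CI commands also export VLLM_USE_V1=1 before running.
from vllm import LLM, SamplingParams

prompts = ["Hello, my name is"]
params = SamplingParams(temperature=0.8, max_tokens=16)

for backend in ("ray", "mp"):
    llm = LLM(
        model="facebook/opt-125m",
        block_size=64,
        enforce_eager=True,
        tensor_parallel_size=2,
        distributed_executor_backend=backend,
    )
    for output in llm.generate(prompts, params):
        print(backend, output.outputs[0].text)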

.buildkite/scripts/tpu/docker_run_bm.sh

Lines changed: 0 additions & 10 deletions
@@ -22,16 +22,6 @@ trap remove_docker_container EXIT
 # Remove the container that might not be cleaned up in the previous run.
 remove_docker_container

-# Build docker image.
-# TODO: build the image outside the script and share the image with other
-# tpu test if building time is too long.
-DOCKER_BUILDKIT=1 docker build \
-  --build-arg max_jobs=16 \
-  --build-arg USE_SCCACHE=1 \
-  --build-arg GIT_REPO_CHECK=0 \
-  --tag vllm/vllm-tpu-bm \
-  --progress plain -f docker/Dockerfile.tpu .
-
 LOG_ROOT=$(mktemp -d)
 # If mktemp fails, set -e will cause the script to exit.
 echo "Results will be stored in: $LOG_ROOT"

.buildkite/test-pipeline.yaml

Lines changed: 27 additions & 5 deletions
@@ -117,16 +117,14 @@ steps:
   commands:
   - pytest -v -s core

-- label: Entrypoints Test # 40min
+- label: Entrypoints Test (LLM) # 40min
   mirror_hardwares: [amdexperimental]
   working_dir: "/vllm-workspace/tests"
   fast_check: true
   torch_nightly: true
   source_file_dependencies:
   - vllm/
   - tests/entrypoints/llm
-  - tests/entrypoints/openai
-  - tests/entrypoints/test_chat_utils
   - tests/entrypoints/offline_mode
   commands:
   - export VLLM_WORKER_MULTIPROC_METHOD=spawn

@@ -135,9 +133,21 @@ steps:
   - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
   - pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process
   - VLLM_USE_V1=0 pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process
+  - VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
+
+- label: Entrypoints Test (API Server) # 40min
+  mirror_hardwares: [amdexperimental]
+  working_dir: "/vllm-workspace/tests"
+  fast_check: true
+  torch_nightly: true
+  source_file_dependencies:
+  - vllm/
+  - tests/entrypoints/openai
+  - tests/entrypoints/test_chat_utils
+  commands:
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
   - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/
   - pytest -v -s entrypoints/test_chat_utils.py
-  - VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests

 - label: Distributed Tests (4 GPUs) # 10min
   mirror_hardwares: [amdexperimental]

@@ -282,7 +292,7 @@ steps:
   - python3 offline_inference/llm_engine_example.py
   - python3 offline_inference/audio_language.py --seed 0
   - python3 offline_inference/vision_language.py --seed 0
-  - python3 offline_inference/vision_language_embedding.py --seed 0
+  - python3 offline_inference/vision_language_pooling.py --seed 0
   - python3 offline_inference/vision_language_multi_image.py --seed 0
   - VLLM_USE_V1=0 python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
   - python3 offline_inference/encoder_decoder.py

@@ -630,6 +640,18 @@ steps:
   # e.g. pytest -v -s models/encoder_decoder/vision_language/test_mllama.py
   # *To avoid merge conflicts, remember to REMOVE (not just comment out) them before merging the PR*

+- label: Transformers Nightly Models Test
+  working_dir: "/vllm-workspace/"
+  optional: true
+  commands:
+  - pip install --upgrade git+https://github.com/huggingface/transformers
+  - pytest -v -s models/test_initialization.py
+  - pytest -v -s tests/models/multimodal/processing/
+  - pytest -v -s tests/models/multimodal/test_mapping.py
+  - python3 examples/offline_inference/basic/chat.py
+  - python3 examples/offline_inference/audio_language.py --model-type whisper
+  - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
+
 ##### 1 GPU test #####
 ##### multi gpus test #####

.github/CODEOWNERS

Lines changed: 1 addition & 0 deletions
@@ -16,6 +16,7 @@
 /vllm/lora @jeejeelee
 /vllm/reasoning @aarnphm
 /vllm/entrypoints @aarnphm
+/vllm/compilation @zou3519 @youkaichao
 CMakeLists.txt @tlrmchlsmth @LucasWilkinson

 # Any change to the VllmConfig changes can have a large user-facing impact,

.github/mergify.yml

Lines changed: 0 additions & 2 deletions
@@ -86,8 +86,6 @@ pull_request_rules:
     - and:
       - files~=^vllm/model_executor/models/
       - files=vllm/model_executor/models/registry.py
-      - files=tests/models/registry.py
-      - files=docs/models/supported_models.md
   actions:
     label:
       add:

.gitignore

Lines changed: 1 addition & 0 deletions
@@ -146,6 +146,7 @@ venv.bak/

 # mkdocs documentation
 /site
+docs/argparse
 docs/examples

 # mypy

CMakeLists.txt

Lines changed: 14 additions & 15 deletions
@@ -171,7 +171,6 @@ if(NVCC_THREADS AND VLLM_GPU_LANG STREQUAL "CUDA")
   list(APPEND VLLM_GPU_FLAGS "--threads=${NVCC_THREADS}")
 endif()

-
 #
 # Use FetchContent for C++ dependencies that are compiled as part of vLLM's build process.
 # setup.py will override FETCHCONTENT_BASE_DIR to play nicely with sccache.

@@ -232,7 +231,6 @@ endif()

 set(VLLM_EXT_SRC
   "csrc/mamba/mamba_ssm/selective_scan_fwd.cu"
-  "csrc/mamba/causal_conv1d/causal_conv1d.cu"
   "csrc/cache_kernels.cu"
   "csrc/attention/paged_attention_v1.cu"
   "csrc/attention/paged_attention_v2.cu"

@@ -393,7 +391,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   # The cutlass_scaled_mm kernels for Hopper (c3x, i.e. CUTLASS 3.x) require
   # CUDA 12.0 or later
   cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;" "${CUDA_ARCHS}")
-  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_ARCHS)
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0 AND SCALED_MM_ARCHS)
     set(SRCS
       "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm90.cu"
       "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_fp8.cu"

@@ -409,7 +407,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
     list(APPEND SCALED_MM_3X_ARCHS "${SCALED_MM_ARCHS}")
     message(STATUS "Building scaled_mm_c3x_sm90 for archs: ${SCALED_MM_ARCHS}")
   else()
-    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_ARCHS)
+    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0 AND SCALED_MM_ARCHS)
       message(STATUS "Not building scaled_mm_c3x_sm90 as CUDA Compiler version is "
                      "not >= 12.0, we recommend upgrading to CUDA 12.0 or "
                      "later if you intend on running FP8 quantized models on "

@@ -424,7 +422,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   # The cutlass_scaled_mm kernels for Geforce Blackwell SM120 (c3x, i.e. CUTLASS 3.x) require
   # CUDA 12.8 or later
   cuda_archs_loose_intersection(SCALED_MM_ARCHS "12.0;12.0a" "${CUDA_ARCHS}")
-  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND SCALED_MM_ARCHS)
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
     set(SRCS
       "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm120.cu"
       "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm120_fp8.cu"

@@ -438,7 +436,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
     list(APPEND SCALED_MM_3X_ARCHS "${SCALED_MM_ARCHS}")
     message(STATUS "Building scaled_mm_c3x_sm120 for archs: ${SCALED_MM_ARCHS}")
   else()
-    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND SCALED_MM_ARCHS)
+    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
       message(STATUS "Not building scaled_mm_c3x_sm120 as CUDA Compiler version is "
                      "not >= 12.8, we recommend upgrading to CUDA 12.8 or "
                      "later if you intend on running FP8 quantized models on "

@@ -453,7 +451,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   # The cutlass_scaled_mm kernels for Blackwell SM100 (c3x, i.e. CUTLASS 3.x)
   # require CUDA 12.8 or later
   cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a" "${CUDA_ARCHS}")
-  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND SCALED_MM_ARCHS)
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
     set(SRCS
       "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm100.cu"
       "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8.cu"

@@ -468,7 +466,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
     list(APPEND SCALED_MM_3X_ARCHS "${SCALED_MM_ARCHS}")
     message(STATUS "Building scaled_mm_c3x_sm100 for archs: ${SCALED_MM_ARCHS}")
   else()
-    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND SCALED_MM_ARCHS)
+    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
       message(STATUS "Not building scaled_mm_c3x_sm100 as CUDA Compiler version is "
                      "not >= 12.8, we recommend upgrading to CUDA 12.8 or "
                      "later if you intend on running FP8 quantized models on "

@@ -511,7 +509,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   # The 2:4 sparse kernels cutlass_scaled_sparse_mm and cutlass_compressor
   # require CUDA 12.2 or later (and only work on Hopper).
   cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;" "${CUDA_ARCHS}")
-  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.2 AND SCALED_MM_ARCHS)
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.2 AND SCALED_MM_ARCHS)
     set(SRCS "csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu")
     set_gencode_flags_for_srcs(
       SRCS "${SRCS}"

@@ -520,7 +518,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
     list(APPEND VLLM_GPU_FLAGS "-DENABLE_SPARSE_SCALED_MM_C3X=1")
     message(STATUS "Building sparse_scaled_mm_c3x for archs: ${SCALED_MM_ARCHS}")
   else()
-    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.2 AND SCALED_MM_ARCHS)
+    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.2 AND SCALED_MM_ARCHS)
       message(STATUS "Not building sparse_scaled_mm_c3x kernels as CUDA Compiler version is "
                      "not >= 12.2, we recommend upgrading to CUDA 12.2 or later "
                      "if you intend on running FP8 sparse quantized models on Hopper.")

@@ -532,7 +530,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")

   # FP4 Archs and flags
   cuda_archs_loose_intersection(FP4_ARCHS "10.0a" "${CUDA_ARCHS}")
-  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND FP4_ARCHS)
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND FP4_ARCHS)
     set(SRCS
       "csrc/quantization/fp4/nvfp4_quant_kernels.cu"
       "csrc/quantization/fp4/nvfp4_experts_quant.cu"

@@ -553,9 +551,10 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")

   # CUTLASS MLA Archs and flags
   cuda_archs_loose_intersection(MLA_ARCHS "10.0a" "${CUDA_ARCHS}")
-  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND MLA_ARCHS)
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND MLA_ARCHS)
     set(SRCS
-      "csrc/attention/mla/cutlass_mla_kernels.cu")
+      "csrc/attention/mla/cutlass_mla_kernels.cu"
+      "csrc/attention/mla/sm100_cutlass_mla_kernel.cu")
     set_gencode_flags_for_srcs(
       SRCS "${SRCS}"
       CUDA_ARCHS "${MLA_ARCHS}")

@@ -642,7 +641,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   # The machete kernels only work on hopper and require CUDA 12.0 or later.
   # Only build Machete kernels if we are building for something compatible with sm90a
   cuda_archs_loose_intersection(MACHETE_ARCHS "9.0a" "${CUDA_ARCHS}")
-  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND MACHETE_ARCHS)
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0 AND MACHETE_ARCHS)
     #
     # For the Machete kernels we automatically generate sources for various
     # preselected input type pairs and schedules.

@@ -694,7 +693,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")

     message(STATUS "Building Machete kernels for archs: ${MACHETE_ARCHS}")
   else()
-    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0
+    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0
         AND MACHETE_ARCHS)
       message(STATUS "Not building Machete kernels as CUDA Compiler version is "
                      "not >= 12.0, we recommend upgrading to CUDA 12.0 or "

README.md

Lines changed: 1 addition & 3 deletions
@@ -63,13 +63,11 @@ vLLM is fast with:
 - Speculative decoding
 - Chunked prefill

-**Performance benchmark**: We include a performance benchmark at the end of [our blog post](https://blog.vllm.ai/2024/09/05/perf-update.html). It compares the performance of vLLM against other LLM serving engines ([TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM), [SGLang](https://github.com/sgl-project/sglang) and [LMDeploy](https://github.com/InternLM/lmdeploy)). The implementation is under [nightly-benchmarks folder](.buildkite/nightly-benchmarks/) and you can [reproduce](https://github.com/vllm-project/vllm/issues/8176) this benchmark using our one-click runnable script.
-
 vLLM is flexible and easy to use with:

 - Seamless integration with popular Hugging Face models
 - High-throughput serving with various decoding algorithms, including *parallel sampling*, *beam search*, and more
-- Tensor parallelism and pipeline parallelism support for distributed inference
+- Tensor, pipeline, data and expert parallelism support for distributed inference
 - Streaming outputs
 - OpenAI-compatible API server
 - Support NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, TPU, and AWS Neuron
