Commit 75fee5b

Merge upstream/main into feature/enable-log-outputs
Resolved merge conflicts in vllm/entrypoints/openai/api_server.py while preserving the logger enhancements and SSE decoding added in this branch. All logger tests pass.

Signed-off-by: Adrian Garcia <adrian.garcia@inceptionai.ai>
2 parents e864415 + 5f0af36 commit 75fee5b
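The commit message refers to the SSE (server-sent events) decoding kept from this branch; that code is not part of this diff. As a generic illustration of the concept only (the function and stream below are hypothetical, not the branch's api_server.py code), SSE streams are usually decoded by reading `data:`-prefixed lines and parsing each payload:

```python
import json


def iter_sse_payloads(lines):
    """Generic SSE decoding sketch: yield the JSON payload of each `data:` event.

    Illustration only; not the implementation referenced in the commit message.
    """
    for raw in lines:
        line = raw.strip()
        if not line or not line.startswith("data:"):
            continue  # skip blank keep-alives, comments, and other SSE fields
        payload = line[len("data:"):].strip()
        if payload == "[DONE]":  # OpenAI-style stream terminator
            break
        yield json.loads(payload)


# Example with a fake OpenAI-style chat stream:
stream = [
    'data: {"choices": [{"delta": {"content": "Hel"}}]}',
    'data: {"choices": [{"delta": {"content": "lo"}}]}',
    "data: [DONE]",
]
print("".join(c["choices"][0]["delta"]["content"] for c in iter_sse_payloads(stream)))
# -> Hello
```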

196 files changed: 10,235 additions, 2,391 deletions


.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh

Lines changed: 1 addition & 1 deletion
@@ -46,6 +46,6 @@ while getopts "m:b:l:f:t:" OPT; do
 done

 lm_eval --model vllm \
-  --model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,distributed_executor_backend=ray,trust_remote_code=true,max_model_len=4096" \
+  --model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,add_bos_token=true,trust_remote_code=true,max_model_len=4096" \
   --tasks gsm8k --num_fewshot "$FEWSHOT" --limit "$LIMIT" \
   --batch_size "$BATCH_SIZE"

.buildkite/lm-eval-harness/test_lm_eval_correctness.py

Lines changed: 3 additions & 1 deletion
@@ -18,12 +18,14 @@


 def launch_lm_eval(eval_config, tp_size):
     trust_remote_code = eval_config.get("trust_remote_code", False)
+    max_model_len = eval_config.get("max_model_len", 4096)
     model_args = (
         f"pretrained={eval_config['model_name']},"
         f"tensor_parallel_size={tp_size},"
         f"enforce_eager=true,"
         f"add_bos_token=true,"
-        f"trust_remote_code={trust_remote_code}"
+        f"trust_remote_code={trust_remote_code},"
+        f"max_model_len={max_model_len}"
     )
     results = lm_eval.simple_evaluate(
         model="vllm",
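For context, a minimal sketch of how the new `max_model_len` default plays out. The config values below are made up for illustration; they are not from the repository's eval configs:

```python
# Hypothetical eval config without a "max_model_len" key -> falls back to 4096.
eval_config = {
    "model_name": "my-org/my-model",  # placeholder model name
    "trust_remote_code": False,
}

trust_remote_code = eval_config.get("trust_remote_code", False)
max_model_len = eval_config.get("max_model_len", 4096)
model_args = (
    f"pretrained={eval_config['model_name']},"
    f"tensor_parallel_size=1,"
    f"enforce_eager=true,"
    f"add_bos_token=true,"
    f"trust_remote_code={trust_remote_code},"
    f"max_model_len={max_model_len}"
)
print(model_args)
# pretrained=my-org/my-model,tensor_parallel_size=1,enforce_eager=true,
# add_bos_token=true,trust_remote_code=False,max_model_len=4096
```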

.buildkite/scripts/tpu/docker_run_bm.sh

Lines changed: 0 additions & 10 deletions
@@ -22,16 +22,6 @@ trap remove_docker_container EXIT
 # Remove the container that might not be cleaned up in the previous run.
 remove_docker_container

-# Build docker image.
-# TODO: build the image outside the script and share the image with other
-# tpu test if building time is too long.
-DOCKER_BUILDKIT=1 docker build \
- --build-arg max_jobs=16 \
- --build-arg USE_SCCACHE=1 \
- --build-arg GIT_REPO_CHECK=0 \
- --tag vllm/vllm-tpu-bm \
- --progress plain -f docker/Dockerfile.tpu .
-
 LOG_ROOT=$(mktemp -d)
 # If mktemp fails, set -e will cause the script to exit.
 echo "Results will be stored in: $LOG_ROOT"

.buildkite/test-pipeline.yaml

Lines changed: 1 addition & 1 deletion
@@ -282,7 +282,7 @@ steps:
   - python3 offline_inference/llm_engine_example.py
   - python3 offline_inference/audio_language.py --seed 0
   - python3 offline_inference/vision_language.py --seed 0
-  - python3 offline_inference/vision_language_embedding.py --seed 0
+  - python3 offline_inference/vision_language_pooling.py --seed 0
   - python3 offline_inference/vision_language_multi_image.py --seed 0
   - VLLM_USE_V1=0 python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
   - python3 offline_inference/encoder_decoder.py

.github/mergify.yml

Lines changed: 0 additions & 2 deletions
@@ -86,8 +86,6 @@ pull_request_rules:
       - and:
         - files~=^vllm/model_executor/models/
         - files=vllm/model_executor/models/registry.py
-        - files=tests/models/registry.py
-        - files=docs/models/supported_models.md
   actions:
     label:
       add:

.gitignore

Lines changed: 1 addition & 0 deletions
@@ -146,6 +146,7 @@ venv.bak/

 # mkdocs documentation
 /site
+docs/argparse
 docs/examples

 # mypy

CMakeLists.txt

Lines changed: 21 additions & 12 deletions
@@ -171,6 +171,15 @@ if(NVCC_THREADS AND VLLM_GPU_LANG STREQUAL "CUDA")
   list(APPEND VLLM_GPU_FLAGS "--threads=${NVCC_THREADS}")
 endif()

+#
+# Set nvcc fatbin compression.
+#
+if(VLLM_GPU_LANG STREQUAL "CUDA")
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
+    list(APPEND VLLM_GPU_FLAGS "-Xfatbin" "-compress-all" "-compress-mode=size")
+  endif()
+endif()
+

 #
 # Use FetchContent for C++ dependencies that are compiled as part of vLLM's build process.
@@ -392,7 +401,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   # The cutlass_scaled_mm kernels for Hopper (c3x, i.e. CUTLASS 3.x) require
   # CUDA 12.0 or later
   cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;" "${CUDA_ARCHS}")
-  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_ARCHS)
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0 AND SCALED_MM_ARCHS)
     set(SRCS
       "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm90.cu"
       "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_fp8.cu"
@@ -408,7 +417,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
     list(APPEND SCALED_MM_3X_ARCHS "${SCALED_MM_ARCHS}")
     message(STATUS "Building scaled_mm_c3x_sm90 for archs: ${SCALED_MM_ARCHS}")
   else()
-    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_ARCHS)
+    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0 AND SCALED_MM_ARCHS)
       message(STATUS "Not building scaled_mm_c3x_sm90 as CUDA Compiler version is "
                      "not >= 12.0, we recommend upgrading to CUDA 12.0 or "
                      "later if you intend on running FP8 quantized models on "
@@ -423,7 +432,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   # The cutlass_scaled_mm kernels for Geforce Blackwell SM120 (c3x, i.e. CUTLASS 3.x) require
   # CUDA 12.8 or later
   cuda_archs_loose_intersection(SCALED_MM_ARCHS "12.0;12.0a" "${CUDA_ARCHS}")
-  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND SCALED_MM_ARCHS)
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
     set(SRCS
       "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm120.cu"
       "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm120_fp8.cu"
@@ -437,7 +446,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
     list(APPEND SCALED_MM_3X_ARCHS "${SCALED_MM_ARCHS}")
     message(STATUS "Building scaled_mm_c3x_sm120 for archs: ${SCALED_MM_ARCHS}")
   else()
-    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND SCALED_MM_ARCHS)
+    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
       message(STATUS "Not building scaled_mm_c3x_sm120 as CUDA Compiler version is "
                      "not >= 12.8, we recommend upgrading to CUDA 12.8 or "
                      "later if you intend on running FP8 quantized models on "
@@ -452,7 +461,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   # The cutlass_scaled_mm kernels for Blackwell SM100 (c3x, i.e. CUTLASS 3.x)
   # require CUDA 12.8 or later
   cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a" "${CUDA_ARCHS}")
-  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND SCALED_MM_ARCHS)
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
     set(SRCS
       "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm100.cu"
       "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8.cu"
@@ -467,7 +476,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
     list(APPEND SCALED_MM_3X_ARCHS "${SCALED_MM_ARCHS}")
     message(STATUS "Building scaled_mm_c3x_sm100 for archs: ${SCALED_MM_ARCHS}")
   else()
-    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND SCALED_MM_ARCHS)
+    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
       message(STATUS "Not building scaled_mm_c3x_sm100 as CUDA Compiler version is "
                      "not >= 12.8, we recommend upgrading to CUDA 12.8 or "
                      "later if you intend on running FP8 quantized models on "
@@ -510,7 +519,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   # The 2:4 sparse kernels cutlass_scaled_sparse_mm and cutlass_compressor
   # require CUDA 12.2 or later (and only work on Hopper).
   cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;" "${CUDA_ARCHS}")
-  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.2 AND SCALED_MM_ARCHS)
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.2 AND SCALED_MM_ARCHS)
     set(SRCS "csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu")
     set_gencode_flags_for_srcs(
       SRCS "${SRCS}"
@@ -519,7 +528,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
     list(APPEND VLLM_GPU_FLAGS "-DENABLE_SPARSE_SCALED_MM_C3X=1")
     message(STATUS "Building sparse_scaled_mm_c3x for archs: ${SCALED_MM_ARCHS}")
   else()
-    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.2 AND SCALED_MM_ARCHS)
+    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.2 AND SCALED_MM_ARCHS)
       message(STATUS "Not building sparse_scaled_mm_c3x kernels as CUDA Compiler version is "
                      "not >= 12.2, we recommend upgrading to CUDA 12.2 or later "
                      "if you intend on running FP8 sparse quantized models on Hopper.")
@@ -531,7 +540,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")

   # FP4 Archs and flags
   cuda_archs_loose_intersection(FP4_ARCHS "10.0a" "${CUDA_ARCHS}")
-  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND FP4_ARCHS)
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND FP4_ARCHS)
     set(SRCS
       "csrc/quantization/fp4/nvfp4_quant_kernels.cu"
       "csrc/quantization/fp4/nvfp4_experts_quant.cu"
@@ -552,7 +561,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")

   # CUTLASS MLA Archs and flags
   cuda_archs_loose_intersection(MLA_ARCHS "10.0a" "${CUDA_ARCHS}")
-  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND MLA_ARCHS)
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND MLA_ARCHS)
     set(SRCS
       "csrc/attention/mla/cutlass_mla_kernels.cu")
     set_gencode_flags_for_srcs(
@@ -641,7 +650,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   # The machete kernels only work on hopper and require CUDA 12.0 or later.
   # Only build Machete kernels if we are building for something compatible with sm90a
   cuda_archs_loose_intersection(MACHETE_ARCHS "9.0a" "${CUDA_ARCHS}")
-  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND MACHETE_ARCHS)
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0 AND MACHETE_ARCHS)
     #
     # For the Machete kernels we automatically generate sources for various
     # preselected input type pairs and schedules.
@@ -693,7 +702,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")

     message(STATUS "Building Machete kernels for archs: ${MACHETE_ARCHS}")
   else()
-    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0
+    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0
         AND MACHETE_ARCHS)
       message(STATUS "Not building Machete kernels as CUDA Compiler version is "
                      "not >= 12.0, we recommend upgrading to CUDA 12.0 or "

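The repeated VERSION_GREATER to VERSION_GREATER_EQUAL change matters because a strict greater-than rejects a compiler that is exactly the documented minimum version. A small Python sketch of the comparison semantics (illustration only; CMake's version comparison behaves the same way for this case):

```python
# Why the strict comparison was wrong: a CUDA toolkit exactly at the minimum
# version should satisfy a ">= minimum" requirement, but fails "> minimum".
from packaging.version import Version

compiler = Version("12.8")   # hypothetical: compiler exactly at the minimum
minimum = Version("12.8")

print(compiler > minimum)    # False -> kernels gated on "> 12.8" were skipped
print(compiler >= minimum)   # True  -> with ">= 12.8" they now build
```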
README.md

Lines changed: 1 addition & 1 deletion
@@ -69,7 +69,7 @@ vLLM is flexible and easy to use with:

 - Seamless integration with popular Hugging Face models
 - High-throughput serving with various decoding algorithms, including *parallel sampling*, *beam search*, and more
-- Tensor parallelism and pipeline parallelism support for distributed inference
+- Tensor, pipeline, data and expert parallelism support for distributed inference
 - Streaming outputs
 - OpenAI-compatible API server
 - Support NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, TPU, and AWS Neuron

benchmarks/benchmark_dataset.py

Lines changed: 3 additions & 0 deletions
@@ -324,6 +324,9 @@ def sample(
         input_low = int(real_input_len * (1 - range_ratio))
         input_high = int(real_input_len * (1 + range_ratio))
         output_low = int(output_len * (1 - range_ratio))
+        # Ensure the lower bound for output length is at least 1 to prevent
+        # sampling 0 tokens, which can cause request failures.
+        output_low = max(output_low, 1)
         output_high = int(output_len * (1 + range_ratio))

         # Add logging for debugging
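A worked example of why the clamp in benchmark_dataset.py is needed (the values below are made up for illustration): with a small output_len and a large range_ratio, the computed lower bound truncates to 0, so a request could ask for zero output tokens.

```python
# Illustrative-only sketch of the bound computation before and after the clamp.
output_len, range_ratio = 4, 0.9

output_low = int(output_len * (1 - range_ratio))   # int(0.4) == 0 -> could sample 0 tokens
output_low = max(output_low, 1)                    # clamp so every request asks for >= 1 token
output_high = int(output_len * (1 + range_ratio))  # int(7.6) == 7

print(output_low, output_high)  # 1 7
```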
Lines changed: 98 additions & 0 deletions
@@ -0,0 +1,98 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import itertools
+from typing import Callable
+
+import torch
+
+from vllm import _custom_ops as ops
+from vllm.config import CompilationConfig, VllmConfig, set_current_vllm_config
+from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8
+from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape
+from vllm.triton_utils import triton
+
+
+# TODO(luka): use standalone_compile utility
+def with_dyn_arg(fn: Callable, arg_index: int, dim_index: int):
+    def inner(*args):
+        torch._dynamo.mark_dynamic(args[arg_index], dim_index)
+        return fn(*args)
+
+    return inner
+
+
+torch._dynamo.config.recompile_limit = 8888
+compilation_config = CompilationConfig(custom_ops=["none"])
+with set_current_vllm_config(VllmConfig(compilation_config=compilation_config)):
+    torch_per_token_quant_fp8 = torch.compile(
+        QuantFP8(False, GroupShape.PER_TOKEN),
+        fullgraph=True,
+        dynamic=False,  # recompile for different shapes
+    )
+
+    # First dim is explicitly dynamic to simulate vLLM usage
+    torch_per_token_quant_fp8 = with_dyn_arg(torch_per_token_quant_fp8, 0, 0)
+
+
+def cuda_per_token_quant_fp8(
+    input: torch.Tensor,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    return ops.scaled_fp8_quant(input)
+
+
+def calculate_diff(batch_size: int, seq_len: int):
+    """Calculate difference between Triton and CUDA implementations."""
+    device = torch.device("cuda")
+    x = torch.rand((batch_size * seq_len, 4096), dtype=torch.float16, device=device)
+
+    torch_out, torch_scale = torch_per_token_quant_fp8(x)
+    cuda_out, cuda_scale = cuda_per_token_quant_fp8(x)
+
+    if torch.allclose(
+        cuda_out.to(torch.float32), torch_out.to(torch.float32), rtol=1e-3, atol=1e-5
+    ) and torch.allclose(cuda_scale, torch_scale, rtol=1e-3, atol=1e-5):
+        print("✅ All implementations match")
+    else:
+        print("❌ Implementations differ")
+
+
+batch_size_range = [1, 16, 32, 64, 128]
+seq_len_range = [1, 16, 64, 128, 256, 512, 1024, 2048, 4096]
+
+configs = list(itertools.product(batch_size_range, seq_len_range))
+
+
+@triton.testing.perf_report(
+    triton.testing.Benchmark(
+        x_names=["batch_size", "seq_len"],
+        x_vals=configs,
+        line_arg="provider",
+        line_vals=["torch", "cuda"],
+        line_names=["Torch", "CUDA"],
+        styles=[("blue", "-"), ("green", "-")],
+        ylabel="us",
+        plot_name="per-token-dynamic-quant-fp8-performance",
+        args={},
+    )
+)
+def benchmark_quantization(batch_size, seq_len, provider):
+    dtype = torch.float16
+    device = torch.device("cuda")
+
+    x = torch.randn(batch_size * seq_len, 4096, device=device, dtype=dtype)
+
+    quantiles = [0.5, 0.2, 0.8]
+
+    if provider == "torch":
+        fn = lambda: torch_per_token_quant_fp8(x.clone())
+    elif provider == "cuda":
+        fn = lambda: cuda_per_token_quant_fp8(x.clone())
+
+    ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(fn, quantiles=quantiles)
+
+    return 1000 * ms, 1000 * max_ms, 1000 * min_ms
+
+
+if __name__ == "__main__":
+    calculate_diff(batch_size=4, seq_len=4096)
+    benchmark_quantization.run(print_data=True)
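As a rough usage sketch only: the new benchmark exposes `calculate_diff` and a Triton `perf_report` entry point, so it could also be driven from another script and asked to save its results. The module name below is an assumption (the file's path is not shown in this commit view), and running it requires a CUDA GPU with vLLM installed.

```python
# Hypothetical driver for the benchmark above; the module name is a placeholder
# for however the new file ends up being importable.
from benchmark_per_token_quant_fp8 import benchmark_quantization, calculate_diff

calculate_diff(batch_size=4, seq_len=4096)  # quick correctness check first
benchmark_quantization.run(
    print_data=True,              # dump the timing table to stdout
    save_path="./fp8_quant_bench",  # Triton's perf_report also writes CSV/plots here
)
```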
