Skip to content

Commit 8c211e5

Browse files
committed
Merge remote-tracking branch 'upstream/main'
2 parents c3f61dd + d5615af commit 8c211e5

File tree

287 files changed

+4324
-4116
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

287 files changed

+4324
-4116
lines changed

.buildkite/lm-eval-harness/test_lm_eval_correctness.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
import pytest
1717
import yaml
1818

19-
RTOL = 0.05
19+
RTOL = 0.08
2020
TEST_DATA_FILE = os.environ.get(
2121
"LM_EVAL_TEST_DATA_FILE",
2222
".buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml")

.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,25 +5,30 @@
55
set -ex
66

77
# Setup cleanup
8-
remove_docker_container() { podman rm -f cpu-test-ubi9-ppc || true; podman system prune -f; }
8+
remove_docker_container() {
9+
if [[ -n "$container_id" ]]; then
10+
podman rm -f "$container_id" || true
11+
fi
12+
podman system prune -f
13+
}
914
trap remove_docker_container EXIT
1015
remove_docker_container
1116

1217
# Try building the docker image
1318
podman build -t cpu-test-ubi9-ppc -f docker/Dockerfile.ppc64le .
1419

1520
# Run the image
16-
podman run -itd --entrypoint /bin/bash -v /tmp/:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN --name cpu-test-ubi9-ppc cpu-test-ubi9-ppc
21+
container_id=$(podman run -itd --entrypoint /bin/bash -v /tmp/:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN cpu-test-ubi9-ppc)
1722

1823
function cpu_tests() {
1924

2025
# offline inference
21-
podman exec cpu-test-ubi9-ppc bash -c "
26+
podman exec -it "$container_id" bash -c "
2227
set -e
2328
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
2429

2530
# Run basic model test
26-
podman exec cpu-test-ubi9-ppc bash -c "
31+
podman exec -it "$container_id" bash -c "
2732
set -e
2833
pip install pytest pytest-asyncio einops peft Pillow soundfile transformers_stream_generator matplotlib
2934
pip install sentence-transformers datamodel_code_generator
@@ -33,6 +38,8 @@ function cpu_tests() {
3338
}
3439

3540
# All of the CPU tests are expected to finish in less than 40 mins.
41+
42+
export container_id
3643
export -f cpu_tests
3744
timeout 40m bash -c cpu_tests
3845

.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ docker run --privileged --net host --shm-size=16G -it \
1919
vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git \
2020
&& python3 -m pip install pytest pytest-asyncio tpu-info \
2121
&& python3 -m pip install lm_eval[api]==0.4.4 \
22+
&& export VLLM_XLA_CACHE_PATH= \
2223
&& export VLLM_USE_V1=1 \
2324
&& export VLLM_XLA_CHECK_RECOMPILATION=1 \
2425
&& echo HARDWARE \
@@ -44,7 +45,9 @@ docker run --privileged --net host --shm-size=16G -it \
4445
&& echo TEST_9 \
4546
&& pytest -s -v /workspace/vllm/tests/v1/tpu/test_multimodal.py \
4647
&& echo TEST_10 \
47-
&& pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py" \
48+
&& pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py \
49+
&& echo TEST_11 \
50+
&& pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py" \
4851

4952

5053
# TODO: This test fails because it uses RANDOM_SEED sampling

.buildkite/test-pipeline.yaml

Lines changed: 39 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
# Documentation
99
# label(str): the name of the test. emoji allowed.
1010
# fast_check(bool): whether to run this on each commit on fastcheck pipeline.
11+
# torch_nightly(bool): whether to run this test on the pipeline that tests vllm against torch nightly.
1112
# fast_check_only(bool): run this test on fastcheck pipeline only
1213
# optional(bool): never run this test by default (i.e. need to unblock manually) unless it's a scheduled nightly run.
1314
# command(str): the single command to run for tests. incompatible with commands.
@@ -70,6 +71,7 @@ steps:
7071
- label: Basic Correctness Test # 30min
7172
#mirror_hardwares: [amd]
7273
fast_check: true
74+
torch_nightly: true
7375
source_file_dependencies:
7476
- vllm/
7577
- tests/basic_correctness/test_basic_correctness
@@ -106,6 +108,7 @@ steps:
106108
- label: Entrypoints Test # 40min
107109
working_dir: "/vllm-workspace/tests"
108110
fast_check: true
111+
torch_nightly: true
109112
#mirror_hardwares: [amd]
110113
amd_gpus: 2 # Just for the sake of queue testing
111114
source_file_dependencies:
@@ -210,6 +213,7 @@ steps:
210213
- pytest -v -s v1/worker
211214
- pytest -v -s v1/structured_output
212215
- pytest -v -s v1/spec_decode
216+
- pytest -v -s v1/test_serial_utils.py
213217
- pytest -v -s v1/test_stats.py
214218
- pytest -v -s v1/test_utils.py
215219
- pytest -v -s v1/test_oracle.py
@@ -327,11 +331,43 @@ steps:
327331
amd_gpus: 8
328332
source_file_dependencies:
329333
- csrc/
334+
- tests/kernels/core
335+
commands:
336+
- pytest -v -s kernels/core
337+
338+
- label: Kernels Attention Test %N
339+
source_file_dependencies:
340+
- csrc/attention/
330341
- vllm/attention
331-
- tests/kernels
342+
- vllm/v1/attention
343+
- tests/kernels/attention
332344
commands:
333-
- pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
334-
parallelism: 4
345+
- pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
346+
parallelism: 2
347+
348+
- label: Kernels Quantization Test %N
349+
source_file_dependencies:
350+
- csrc/quantization/
351+
- vllm/model_executor/layers/quantization
352+
- tests/kernels/quantization
353+
commands:
354+
- pytest -v -s kernels/quantization --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
355+
parallelism: 2
356+
357+
- label: Kernels MoE Test
358+
source_file_dependencies:
359+
- csrc/moe/
360+
- tests/kernels/moe
361+
- vllm/model_executor/layers/fused_moe/
362+
commands:
363+
- pytest -v -s kernels/moe
364+
365+
- label: Kernels Mamba Test
366+
source_file_dependencies:
367+
- csrc/mamba/
368+
- tests/kernels/mamba
369+
commands:
370+
- pytest -v -s kernels/mamba
335371

336372
- label: Tensorizer Test # 11min
337373
working_dir: "/vllm-workspace/tests"

.github/mergify.yml

Lines changed: 32 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -55,11 +55,19 @@ pull_request_rules:
5555
description: Automatically apply structured-output label
5656
conditions:
5757
- or:
58+
- files~=^benchmarks/structured_schemas/
59+
- files=benchmarks/benchmark_serving_structured_output.py
60+
- files=benchmarks/run_structured_output_benchmark.sh
61+
- files=docs/source/features/structured_outputs.md
62+
- files=examples/offline_inference/structured_outputs.py
63+
- files=examples/online_serving/openai_chat_completion_structured_outputs.py
64+
- files=examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py
5865
- files~=^vllm/model_executor/guided_decoding/
5966
- files=tests/model_executor/test_guided_processors.py
6067
- files=tests/entrypoints/llm/test_guided_generate.py
61-
- files=benchmarks/benchmark_serving_guided.py
62-
- files=benchmarks/benchmark_guided.py
68+
- files~=^tests/v1/structured_output/
69+
- files=tests/v1/entrypoints/llm/test_guided_generate.py
70+
- files~=^vllm/v1/structured_output/
6371
actions:
6472
label:
6573
add:
@@ -118,6 +126,28 @@ pull_request_rules:
118126
remove:
119127
- tpu
120128

129+
- name: label-tool-calling
130+
description: Automatically add tool-calling label
131+
conditions:
132+
- or:
133+
- files~=^tests/tool_use/
134+
- files~=^tests/mistral_tool_use/
135+
- files~=^tests/entrypoints/openai/tool_parsers/
136+
- files=tests/entrypoints/openai/test_chat_with_tool_reasoning.py
137+
- files~=^vllm/entrypoints/openai/tool_parsers/
138+
- files=docs/source/features/tool_calling.md
139+
- files=docs/source/getting_started/examples/openai_chat_completion_client_with_tools.md
140+
- files=docs/source/getting_started/examples/chat_with_tools.md
141+
- files~=^examples/tool_chat_*
142+
- files=examples/offline_inference/chat_with_tools.py
143+
- files=examples/online_serving/openai_chat_completion_client_with_tools_required.py
144+
- files=examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py
145+
- files=examples/online_serving/openai_chat_completion_client_with_tools.py
146+
actions:
147+
label:
148+
add:
149+
- tool-calling
150+
121151
- name: ping author on conflicts and add 'needs-rebase' label
122152
conditions:
123153
- conflict

benchmarks/benchmark_serving.py

Lines changed: 20 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -713,7 +713,7 @@ def main(args: argparse.Namespace):
713713
))
714714

715715
# Save config and results to json
716-
if args.save_result:
716+
if args.save_result or args.append_result:
717717
result_json: dict[str, Any] = {}
718718

719719
# Setup
@@ -734,6 +734,14 @@ def main(args: argparse.Namespace):
734734
raise ValueError(
735735
"Invalid metadata format. Please use KEY=VALUE format."
736736
)
737+
# Traffic
738+
result_json["request_rate"] = (args.request_rate if args.request_rate
739+
< float("inf") else "inf")
740+
result_json["burstiness"] = args.burstiness
741+
result_json["max_concurrency"] = args.max_concurrency
742+
743+
# Merge with benchmark result
744+
result_json = {**result_json, **benchmark_result}
737745

738746
if not args.save_detailed:
739747
# Remove fields with too many data points
@@ -744,15 +752,6 @@ def main(args: argparse.Namespace):
744752
if field in result_json:
745753
del result_json[field]
746754

747-
# Traffic
748-
result_json["request_rate"] = (args.request_rate if args.request_rate
749-
< float("inf") else "inf")
750-
result_json["burstiness"] = args.burstiness
751-
result_json["max_concurrency"] = args.max_concurrency
752-
753-
# Merge with benchmark result
754-
result_json = {**result_json, **benchmark_result}
755-
756755
# Save to file
757756
base_model_id = model_id.split("/")[-1]
758757
max_concurrency_str = (f"-concurrency{args.max_concurrency}"
@@ -762,7 +761,12 @@ def main(args: argparse.Namespace):
762761
file_name = args.result_filename
763762
if args.result_dir:
764763
file_name = os.path.join(args.result_dir, file_name)
765-
with open(file_name, "w", encoding='utf-8') as outfile:
764+
with open(file_name,
765+
mode="a+" if args.append_result else "w",
766+
encoding='utf-8') as outfile:
767+
# Append a newline.
768+
if args.append_result and outfile.tell() != 0:
769+
outfile.write("\n")
766770
json.dump(result_json, outfile)
767771
save_to_pytorch_benchmark_format(args, result_json, file_name)
768772

@@ -894,6 +898,11 @@ def main(args: argparse.Namespace):
894898
help="When saving the results, whether to include per request "
895899
"information such as response, error, ttfs, tpots, etc.",
896900
)
901+
parser.add_argument(
902+
"--append-result",
903+
action="store_true",
904+
help="Append the benchmark result to the existing json file.",
905+
)
897906
parser.add_argument(
898907
"--metadata",
899908
metavar="KEY=VALUE",

benchmarks/benchmark_serving_structured_output.py

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@
5151
except ImportError:
5252
from argparse import ArgumentParser as FlexibleArgumentParser
5353

54-
from vllm.v1.structured_output.utils import (
54+
from vllm.v1.structured_output.backend_xgrammar import (
5555
has_xgrammar_unsupported_json_features)
5656

5757
MILLISECONDS_TO_SECONDS_CONVERSION = 1000
@@ -150,17 +150,17 @@ def get_schema(index: int):
150150

151151
elif args.dataset == "grammar":
152152
schema = """
153-
?start: select_statement
153+
root ::= select_statement
154154
155-
?select_statement: "SELECT " column_list " FROM " table_name
155+
select_statement ::= "SELECT " column " from " table " where " condition
156156
157-
?column_list: column_name ("," column_name)*
157+
column ::= "col_1 " | "col_2 "
158158
159-
?table_name: identifier
159+
table ::= "table_1 " | "table_2 "
160160
161-
?column_name: identifier
161+
condition ::= column "= " number
162162
163-
?identifier: /[a-zA-Z_][a-zA-Z0-9_]*/
163+
number ::= "1 " | "2 "
164164
"""
165165
prompt = "Generate an SQL query to show the 'username' \
166166
and 'email' from the 'users' table."

benchmarks/kernels/benchmark_lora.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,14 @@
1717
from utils import ArgPool, Bench, CudaGraphBenchParams
1818
from weight_shapes import WEIGHT_SHAPES
1919

20-
from vllm.lora.ops.triton_ops import LoRAKernelMeta, lora_expand, lora_shrink
21-
from vllm.lora.ops.triton_ops.utils import _LORA_A_PTR_DICT, _LORA_B_PTR_DICT
20+
from vllm.triton_utils import HAS_TRITON
21+
22+
if HAS_TRITON:
23+
from vllm.lora.ops.triton_ops import (LoRAKernelMeta, lora_expand,
24+
lora_shrink)
25+
from vllm.lora.ops.triton_ops.utils import (_LORA_A_PTR_DICT,
26+
_LORA_B_PTR_DICT)
27+
2228
from vllm.utils import FlexibleArgumentParser
2329

2430
DEFAULT_MODELS = list(WEIGHT_SHAPES.keys())

cmake/external_projects/vllm_flash_attn.cmake

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ else()
3838
FetchContent_Declare(
3939
vllm-flash-attn
4040
GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
41-
GIT_TAG 0a721daebe4fa7149f06ecf3d3eabeb6dcd0f1fa
41+
GIT_TAG 8798f27777fb57f447070301bf33a9f9c607f491
4242
GIT_PROGRESS TRUE
4343
# Don't share the vllm-flash-attn build between build types
4444
BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn

docker/Dockerfile

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -162,6 +162,9 @@ ENV UV_HTTP_TIMEOUT=500
162162
COPY requirements/lint.txt requirements/lint.txt
163163
COPY requirements/test.txt requirements/test.txt
164164
COPY requirements/dev.txt requirements/dev.txt
165+
# Workaround for #17068
166+
RUN --mount=type=cache,target=/root/.cache/uv \
167+
uv pip install --system mamba-ssm==2.2.4 --no-build-isolation
165168
RUN --mount=type=cache,target=/root/.cache/uv \
166169
uv pip install --system -r requirements/dev.txt
167170
#################### DEV IMAGE ####################
@@ -265,6 +268,9 @@ ADD . /vllm-workspace/
265268
ENV UV_HTTP_TIMEOUT=500
266269

267270
# install development dependencies (for testing)
271+
# Workaround for #17068
272+
RUN --mount=type=cache,target=/root/.cache/uv \
273+
uv pip install --system mamba-ssm==2.2.4 --no-build-isolation
268274
RUN --mount=type=cache,target=/root/.cache/uv \
269275
uv pip install --system -r requirements/dev.txt
270276

0 commit comments

Comments
 (0)