
Commit addaa2e

Merge branch 'main' into kv-xfer-updates

2 parents: 6eb01a5 + aed2bca

329 files changed, 11769 insertions(+), 3796 deletions(-)


.buildkite/scripts/hardware_ci/run-cpu-test.sh

Lines changed: 12 additions & 3 deletions
@@ -24,13 +24,22 @@ numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$NUMA_NODE
 numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$NUMA_NODE"-avx2 --target vllm-test -f docker/Dockerfile.cpu .

 # Run the image, setting --shm-size=4g for tensor parallel.
-docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_OMP_THREADS_BIND="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
-docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_OMP_THREADS_BIND="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2
+docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_OMP_THREADS_BIND="$OMP_CORE_RANGE" --env VLLM_CPU_CI_ENV=1 --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
+docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_OMP_THREADS_BIND="$OMP_CORE_RANGE" --env VLLM_CPU_CI_ENV=1 --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2

 function cpu_tests() {
   set -e
   export NUMA_NODE=$2

+  # list packages
+  docker exec cpu-test-"$NUMA_NODE"-avx2 bash -c "
+    set -e
+    pip list"
+
+  docker exec cpu-test-"$NUMA_NODE" bash -c "
+    set -e
+    pip list"
+
   # offline inference
   docker exec cpu-test-"$NUMA_NODE"-avx2 bash -c "
     set -e
@@ -72,7 +81,7 @@ function cpu_tests() {
     set -e
     python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m --dtype half &
     timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
-    python3 benchmarks/benchmark_serving.py \
+    VLLM_CPU_CI_ENV=0 python3 benchmarks/benchmark_serving.py \
       --backend vllm \
       --dataset-name random \
       --model facebook/opt-125m \
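Note that the test containers now run with VLLM_CPU_CI_ENV=1, while the serving-benchmark client resets it to 0, which suggests the flag toggles CI-specific behavior in the CPU backend; the diff does not show where the flag is consumed. A minimal sketch of how such an environment flag is typically read (an assumption for illustration, not vLLM's actual code, which keeps its env handling in vllm/envs.py):

import os

# Hypothetical consumer of the flag: treat "1" as enabled, anything else as off.
CPU_CI_ENV = os.getenv("VLLM_CPU_CI_ENV", "0") == "1"

if CPU_CI_ENV:
    print("running with CI-specific CPU settings")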

.buildkite/test-pipeline.yaml

Lines changed: 7 additions & 1 deletion
@@ -177,6 +177,11 @@ steps:
   - tests/tracing
   commands:
   - pytest -v -s metrics
+  - "pip install \
+    'opentelemetry-sdk>=1.26.0' \
+    'opentelemetry-api>=1.26.0' \
+    'opentelemetry-exporter-otlp>=1.26.0' \
+    'opentelemetry-semantic-conventions-ai>=0.4.1'"
   - pytest -v -s tracing

 ##### fast check tests  #####
@@ -305,6 +310,7 @@ steps:
   commands:
   - pytest -v -s compile/test_pass_manager.py
   - pytest -v -s compile/test_fusion.py
+  - pytest -v -s compile/test_fusion_attn.py
   - pytest -v -s compile/test_silu_mul_quant_fusion.py
   - pytest -v -s compile/test_sequence_parallelism.py
   - pytest -v -s compile/test_async_tp.py
@@ -669,7 +675,7 @@ steps:
   - pytest -v -s plugins/lora_resolvers # unit tests for in-tree lora resolver plugins

 - label: Multi-step Tests (4 GPUs) # 36min
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
   working_dir: "/vllm-workspace/tests"
   num_gpus: 4
   source_file_dependencies:

.github/mergify.yml

Lines changed: 15 additions & 0 deletions
@@ -65,6 +65,21 @@ pull_request_rules:
       add:
         - multi-modality

+- name: label-qwen
+  description: Automatically apply qwen label
+  conditions:
+    - or:
+      - files~=^examples/.*qwen.*\.py
+      - files~=^tests/.*qwen.*\.py
+      - files~=^vllm/model_executor/models/.*qwen.*\.py
+      - files~=^vllm/reasoning/.*qwen.*\.py
+      - title~=(?i)Qwen
+      - body~=(?i)Qwen
+  actions:
+    label:
+      add:
+        - qwen
+
 - name: label-rocm
   description: Automatically apply rocm label
   conditions:
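Mergify's files~= conditions apply a regular expression to each changed file's path, while title~= and body~= match the PR title and description. A quick way to sanity-check the new path patterns locally (a standalone sketch covering only the file-path conditions; the example paths are hypothetical, not from this commit):

import re

# The four path patterns from the label-qwen rule above.
PATTERNS = [
    r"^examples/.*qwen.*\.py",
    r"^tests/.*qwen.*\.py",
    r"^vllm/model_executor/models/.*qwen.*\.py",
    r"^vllm/reasoning/.*qwen.*\.py",
]

def would_label_qwen(changed_files):
    # Approximates the rule with re.search; the ^ anchors make this
    # behave like a prefix-anchored match on each path.
    return any(
        re.search(pattern, path)
        for pattern in PATTERNS
        for path in changed_files
    )

# Hypothetical changed-file lists, for illustration only.
assert would_label_qwen(["vllm/model_executor/models/qwen2_vl.py"])
assert not would_label_qwen(["vllm/model_executor/models/llama.py"])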

.gitignore

Lines changed: 1 addition & 1 deletion
@@ -200,5 +200,5 @@ benchmarks/**/*.json
 actionlint
 shellcheck*/

-# Ingore moe/marlin_moe gen code
+# Ignore moe/marlin_moe gen code
 csrc/moe/marlin_moe_wna16/kernel_*

.pre-commit-config.yaml

Lines changed: 10 additions & 5 deletions
@@ -20,12 +20,10 @@ repos:
     args: [--output-format, github, --fix]
   - id: ruff-format
     files: ^(.buildkite|benchmarks|examples)/.*
-- repo: https://github.com/codespell-project/codespell
-  rev: v2.4.1
+- repo: https://github.com/crate-ci/typos
+  rev: v1.32.0
   hooks:
-  - id: codespell
-    additional_dependencies: ['tomli']
-    args: ['--toml', 'pyproject.toml']
+  - id: typos
 - repo: https://github.com/PyCQA/isort
   rev: 6.0.1
   hooks:
@@ -145,6 +143,13 @@ repos:
     types: [python]
     pass_filenames: false
     additional_dependencies: [regex]
+  - id: check-pickle-imports
+    name: Prevent new pickle/cloudpickle imports
+    entry: python tools/check_pickle_imports.py
+    language: python
+    types: [python]
+    pass_filenames: false
+    additional_dependencies: [pathspec, regex]
   # Keep `suggestion` last
   - id: suggestion
     name: Suggestion
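The hook's entry point, tools/check_pickle_imports.py, is added elsewhere in this commit and its contents are not shown here. As a rough illustration only, a checker of this shape could scan the tree for pickle/cloudpickle imports outside an allowlist and fail the commit if any new ones appear (the allowlist entry and all logic below are assumptions, not the real script):

import re
import sys
from pathlib import Path

# Hypothetical allowlist of files permitted to import pickle/cloudpickle.
ALLOWED_FILES = {
    "vllm/utils.py",  # illustrative entry, not taken from the real script
}

IMPORT_RE = re.compile(r"^\s*(import|from)\s+(pickle|cloudpickle)\b", re.MULTILINE)

def main() -> int:
    bad = []
    for path in Path(".").rglob("*.py"):
        rel = path.as_posix()
        if rel in ALLOWED_FILES:
            continue
        if IMPORT_RE.search(path.read_text(encoding="utf-8", errors="ignore")):
            bad.append(rel)
    for rel in bad:
        print(f"{rel}: new pickle/cloudpickle import is not allowed")
    return 1 if bad else 0

if __name__ == "__main__":
    sys.exit(main())

Running it with `language: python` and `pass_filenames: false`, as configured above, means pre-commit invokes the script once over the whole repository rather than per changed file.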

CMakeLists.txt

Lines changed: 5 additions & 5 deletions
@@ -420,9 +420,9 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
     endif()
   endif()

-  # The cutlass_scaled_mm kernels for Blackwell (c3x, i.e. CUTLASS 3.x) require
-  # CUDA 12.8 or later
-  cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;12.0a" "${CUDA_ARCHS}")
+  # The cutlass_scaled_mm kernels for Blackwell SM100 (c3x, i.e. CUTLASS 3.x)
+  # require CUDA 12.8 or later
+  cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a" "${CUDA_ARCHS}")
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND SCALED_MM_ARCHS)
     set(SRCS
       "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm100.cu"
@@ -542,10 +542,10 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")

   # CUTLASS MoE kernels

-  # The MoE kernel cutlass_moe_mm requires CUDA 12.3 or later (and only works
+  # The MoE kernel cutlass_moe_mm requires CUDA 12.3 or later (and ONLY works
   # on Hopper). get_cutlass_(pplx_)moe_mm_data should only be compiled
   # if it's possible to compile MoE kernels that use its output.
-  cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;10.0a" "${CUDA_ARCHS}")
+  cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a" "${CUDA_ARCHS}")
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS)
     set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cu"
       "csrc/quantization/cutlass_w8a8/moe/moe_data.cu")

README.md

Lines changed: 1 addition & 1 deletion
@@ -156,7 +156,7 @@ If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs

 - For technical questions and feature requests, please use GitHub [Issues](https://github.com/vllm-project/vllm/issues) or [Discussions](https://github.com/vllm-project/vllm/discussions)
 - For discussing with fellow users, please use the [vLLM Forum](https://discuss.vllm.ai)
-- coordinating contributions and development, please use [Slack](https://slack.vllm.ai)
+- For coordinating contributions and development, please use [Slack](https://slack.vllm.ai)
 - For security disclosures, please use GitHub's [Security Advisories](https://github.com/vllm-project/vllm/security/advisories) feature
 - For collaborations and partnerships, please contact us at [vllm-questions@lists.berkeley.edu](mailto:vllm-questions@lists.berkeley.edu)

benchmarks/benchmark_latency.py

Lines changed: 7 additions & 1 deletion
@@ -123,7 +123,7 @@ def run_to_completion(profile_dir: Optional[str] = None):
     save_to_pytorch_benchmark_format(args, results)


-if __name__ == "__main__":
+def create_argument_parser():
     parser = FlexibleArgumentParser(
         description="Benchmark the latency of processing a single batch of "
         "requests till completion."
@@ -171,6 +171,12 @@ def run_to_completion(profile_dir: Optional[str] = None):
     # V1 enables prefix caching by default which skews the latency
     # numbers. We need to disable prefix caching by default.
     parser.set_defaults(enable_prefix_caching=False)
+
+    return parser
+
+
+if __name__ == "__main__":
+    parser = create_argument_parser()
     args = parser.parse_args()
     if args.profile and not envs.VLLM_TORCH_PROFILER_DIR:
         raise OSError(
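The same refactor is applied to the two benchmark scripts below: the argparse setup moves out of the __main__ block into a create_argument_parser() function, so other tooling (tests, documentation generators) can obtain the fully configured parser without running the benchmark. A minimal standalone sketch of the pattern, with generic names rather than vLLM's FlexibleArgumentParser:

import argparse

def create_argument_parser() -> argparse.ArgumentParser:
    # Build the parser once, so callers other than __main__ can reuse it.
    parser = argparse.ArgumentParser(description="Example benchmark")
    parser.add_argument("--num-iters", type=int, default=10)
    return parser

if __name__ == "__main__":
    args = create_argument_parser().parse_args()
    print(args.num_iters)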

benchmarks/benchmark_long_document_qa_throughput.py

Lines changed: 7 additions & 1 deletion
@@ -142,7 +142,7 @@ def main(args):
     )


-if __name__ == "__main__":
+def create_argument_parser():
     parser = FlexibleArgumentParser(
         description="Benchmark the performance with or "
         "without automatic prefix caching."
@@ -192,5 +192,11 @@ def main(args):
     )

     parser = EngineArgs.add_cli_args(parser)
+
+    return parser
+
+
+if __name__ == "__main__":
+    parser = create_argument_parser()
     args = parser.parse_args()
     main(args)

benchmarks/benchmark_prefix_caching.py

Lines changed: 7 additions & 1 deletion
@@ -218,7 +218,7 @@ def main(args):
     )


-if __name__ == "__main__":
+def create_argument_parser():
     parser = FlexibleArgumentParser(
         description="Benchmark the performance with or without "
         "automatic prefix caching."
@@ -268,5 +268,11 @@ def main(args):
     )

     parser = EngineArgs.add_cli_args(parser)
+
+    return parser
+
+
+if __name__ == "__main__":
+    parser = create_argument_parser()
     args = parser.parse_args()
     main(args)
