
Commit 81fccb0

Merge remote-tracking branch 'upstream/main' into vlm-transformers

2 parents: d1e6d95 + ae9c4d4
File tree: 573 files changed, +39461 −8790 lines


.buildkite/nightly-benchmarks/nightly-annotation.md

Lines changed: 1 addition & 1 deletion

@@ -16,7 +16,7 @@ Please download the visualization scripts in the post
 - Download `nightly-benchmarks.zip`.
 - In the same folder, run the following code:
 
-```console
+```bash
 export HF_TOKEN=<your HF token>
 apt update
 apt install -y git

.buildkite/release-pipeline.yaml

Lines changed: 2 additions & 0 deletions

@@ -102,6 +102,7 @@ steps:
   commands:
     - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
     - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
+    - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest"
     - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
   env:
     DOCKER_BUILDKIT: "1"
@@ -117,6 +118,7 @@ steps:
   commands:
     - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
     - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:latest --progress plain -f docker/Dockerfile.neuron ."
+    - "docker push public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:latest"
     - "docker push public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:$(buildkite-agent meta-data get release-version)"
   env:
     DOCKER_BUILDKIT: "1"
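The two new lines make the release pipeline push the freshly built `latest` tag alongside the version-pinned tag. A minimal sketch of that build-once, tag-twice, push-both pattern (the repository name here is illustrative, not the real one):

```bash
#!/bin/bash
# Build a single image with two tags, then push both.
# IMAGE is a hypothetical repository used only for illustration.
set -euo pipefail
IMAGE=public.ecr.aws/example/vllm-release-repo
VERSION=$(buildkite-agent meta-data get release-version)

DOCKER_BUILDKIT=1 docker build --tag "$IMAGE:$VERSION" --tag "$IMAGE:latest" .
docker push "$IMAGE:latest"      # movable alias for "most recent release"
docker push "$IMAGE:$VERSION"    # immutable, version-pinned tag
```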

.buildkite/scripts/hardware_ci/run-cpu-test.sh

Lines changed: 2 additions & 1 deletion

@@ -51,6 +51,7 @@ function cpu_tests() {
   pytest -v -s tests/kernels/attention/test_cache.py -m cpu_model
   pytest -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model
   pytest -v -s tests/models/language/generation -m cpu_model
+  VLLM_CPU_SGL_KERNEL=1 pytest -v -s tests/models/language/generation -m cpu_model
   pytest -v -s tests/models/language/pooling -m cpu_model
   pytest -v -s tests/models/multimodal/generation \
     --ignore=tests/models/multimodal/generation/test_mllama.py \
@@ -98,4 +99,4 @@ function cpu_tests() {
 
 # All of CPU tests are expected to be finished less than 40 mins.
 export -f cpu_tests
-timeout 1h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
+timeout 1.5h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
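The timeout bump to 1.5h accounts for the extra `VLLM_CPU_SGL_KERNEL=1` pass over the generation tests. For context, a minimal sketch of the `export -f` plus `timeout` pattern the script ends with, using illustrative values:

```bash
#!/bin/bash
# bash -c starts a child shell; it only sees the function because
# export -f copies the function definition into the environment.
CORE_RANGE="0-3"   # illustrative value
NUMA_NODE=0        # illustrative value

cpu_tests() {
  echo "running CPU tests on cores $1 (NUMA node $2)"
}
export -f cpu_tests

# timeout kills the child shell if it runs past the budget and returns
# exit status 124, which fails the CI step.
timeout 1.5h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
```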

.buildkite/scripts/hardware_ci/run-hpu-test.sh

Lines changed: 40 additions & 8 deletions

@@ -2,10 +2,34 @@
 
 # This script build the CPU docker image and run the offline inference inside the container.
 # It serves a sanity check for compilation and basic model usage.
-set -ex
+set -exuo pipefail
 
 # Try building the docker image
-docker build -t hpu-test-env -f docker/Dockerfile.hpu .
+cat <<EOF | docker build -t hpu-plugin-v1-test-env -f - .
+FROM 1.22-413-pt2.7.1:latest
+
+COPY ./ /workspace/vllm
+
+WORKDIR /workspace/vllm
+
+RUN pip install -v -r requirements/hpu.txt
+RUN pip install git+https://github.com/vllm-project/vllm-gaudi.git
+
+ENV no_proxy=localhost,127.0.0.1
+ENV PT_HPU_ENABLE_LAZY_COLLECTIVES=true
+
+RUN VLLM_TARGET_DEVICE=hpu python3 setup.py install
+
+# install development dependencies (for testing)
+RUN python3 -m pip install -e tests/vllm_test_utils
+
+WORKDIR /workspace/
+
+RUN git clone https://github.com/vllm-project/vllm-gaudi.git
+
+RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
+
+EOF
 
 # Setup cleanup
 # certain versions of HPU software stack have a bug that can
@@ -14,13 +38,21 @@ docker build -t hpu-test-env -f docker/Dockerfile.hpu .
 # functions, while other platforms only need one remove_docker_container
 # function.
 EXITCODE=1
-remove_docker_containers() { docker rm -f hpu-test || true; docker rm -f hpu-test-tp2 || true; }
-remove_docker_containers_and_exit() { remove_docker_containers; exit $EXITCODE; }
-trap remove_docker_containers_and_exit EXIT
+remove_docker_containers() { docker rm -f hpu-plugin-v1-test || true; }
+trap 'remove_docker_containers; exit $EXITCODE;' EXIT
 remove_docker_containers
 
-# Run the image and launch offline inference
-docker run --runtime=habana --name=hpu-test --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m
-docker run --runtime=habana --name=hpu-test-tp2 --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --tensor-parallel-size 2
+echo "Running HPU plugin v1 test"
+docker run --rm --runtime=habana --name=hpu-plugin-v1-test --network=host \
+  -e HABANA_VISIBLE_DEVICES=all \
+  hpu-plugin-v1-test-env \
+  /bin/bash "/workspace/vllm-gaudi/tests/upstream_tests/ci_tests.sh"
 
 EXITCODE=$?
+if [ $EXITCODE -eq 0 ]; then
+  echo "Test with basic model passed"
+else
+  echo "Test with basic model FAILED with exit code: $EXITCODE" >&2
+fi
+
+# The trap will handle the container removal and final exit.
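The rewritten cleanup collapses the two helper functions into a single trap. The single quotes around the trap body matter: they delay expansion of `$EXITCODE` until the trap actually fires, so the value captured after `docker run` is the one re-raised as the script's status. A stripped-down sketch of the pattern (container and image names are illustrative):

```bash
#!/bin/bash
EXITCODE=1   # pessimistic default: anything short of a clean run fails
remove_docker_containers() { docker rm -f my-test || true; }
# Single quotes: $EXITCODE is expanded when the trap fires, not when set.
trap 'remove_docker_containers; exit $EXITCODE;' EXIT
remove_docker_containers   # clear leftovers from a previous run

docker run --name=my-test my-test-env ./run_tests.sh
EXITCODE=$?   # captured here, re-raised by the EXIT trap above
```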

.buildkite/scripts/hardware_ci/run-neuron-test.sh

Lines changed: 2 additions & 1 deletion

@@ -54,10 +54,11 @@ docker run --rm -it --device=/dev/neuron0 --network bridge \
   --name "${container_name}" \
   ${image_name} \
   /bin/bash -c "
+    set -e; # Exit on first error
     python3 /workspace/vllm/examples/offline_inference/neuron.py;
     python3 -m pytest /workspace/vllm/tests/neuron/1_core/ -v --capture=tee-sys;
     for f in /workspace/vllm/tests/neuron/2_core/*.py; do
-      echo 'Running test file: '$f;
+      echo \"Running test file: \$f\";
       python3 -m pytest \$f -v --capture=tee-sys;
     done
   "

.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh

Lines changed: 2 additions & 0 deletions

@@ -159,6 +159,8 @@ run_and_track_test 14 "test_tpu_qkv_linear.py" \
   "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_tpu_qkv_linear.py"
 run_and_track_test 15 "test_spmd_model_weight_loading.py" \
   "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_spmd_model_weight_loading.py"
+run_and_track_test 16 "test_kv_cache_update_kernel.py" \
+  "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_kv_cache_update_kernel.py"
 
 # After all tests have been attempted, exit with the overall status.
 if [ "$overall_script_exit_code" -ne 0 ]; then

.buildkite/scripts/hardware_ci/run-xpu-test.sh

Lines changed: 1 addition & 0 deletions

@@ -28,4 +28,5 @@ docker run \
   sh -c '
     VLLM_USE_V1=0 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m
     VLLM_USE_V1=0 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m -tp 2
+    VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
   '

.buildkite/scripts/tpu/config_v6e_1.env

Lines changed: 2 additions & 2 deletions

@@ -4,8 +4,8 @@ CONTAINER_NAME=vllm-tpu
 
 # vllm config
 MODEL=meta-llama/Llama-3.1-8B-Instruct
-MAX_NUM_SEQS=512
-MAX_NUM_BATCHED_TOKENS=512
+MAX_NUM_SEQS=256
+MAX_NUM_BATCHED_TOKENS=1024
 TENSOR_PARALLEL_SIZE=1
 MAX_MODEL_LEN=2048
 DOWNLOAD_DIR=/mnt/disks/persist
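These values tune the benchmark server: in vLLM, `--max-num-seqs` caps how many sequences can be scheduled concurrently, while `--max-num-batched-tokens` caps the token budget per scheduler step, so this change halves concurrency and doubles the per-step token budget. Assuming the benchmark harness forwards the env file to a `vllm serve` invocation (the mapping below is illustrative, not taken from the harness itself), the new values would correspond to:

```bash
# Illustrative mapping of the env file onto vLLM CLI flags; the actual
# benchmark scripts read these variables rather than hardcoding flags.
vllm serve meta-llama/Llama-3.1-8B-Instruct \
  --max-num-seqs 256 \
  --max-num-batched-tokens 1024 \
  --tensor-parallel-size 1 \
  --max-model-len 2048 \
  --download-dir /mnt/disks/persist
```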

.buildkite/scripts/tpu/docker_run_bm.sh

Lines changed: 1 addition & 1 deletion

@@ -68,7 +68,7 @@ docker run \
 
 echo "run script..."
 echo
-docker exec "$CONTAINER_NAME" /bin/bash -c ".buildkite/scripts/hardware_ci/run_bm.sh"
+docker exec "$CONTAINER_NAME" /bin/bash -c ".buildkite/scripts/tpu/run_bm.sh"
 
 echo "copy result back..."
 VLLM_LOG="$LOG_ROOT/$TEST_NAME"_vllm_log.txt

.buildkite/test-pipeline.yaml

Lines changed: 55 additions & 3 deletions

@@ -41,6 +41,16 @@ steps:
   # TODO: add `--strict` once warnings in docstrings are fixed
   - mkdocs build
 
+- label: Pytorch Nightly Dependency Override Check # 2min
+  # if this test fails, it means the nightly torch version is not compatible with some
+  # of the dependencies. Please check the error message and add the package to whitelist
+  # in /vllm/tools/generate_nightly_torch_test.py
+  soft_fail: true
+  source_file_dependencies:
+  - requirements/nightly_torch_test.txt
+  commands:
+  - bash standalone_tests/pytorch_nightly_dependency.sh
+
 - label: Async Engine, Inputs, Utils, Worker Test # 24min
   mirror_hardwares: [amdexperimental]
   source_file_dependencies:
@@ -89,7 +99,7 @@ steps:
   - VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
 
 - label: Chunked Prefill Test
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental, amdproduction]
   source_file_dependencies:
   - vllm/
   - tests/basic_correctness/test_chunked_prefill
@@ -168,6 +178,23 @@ steps:
   - VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
   - popd
 
+- label: EPLB Algorithm Test
+  working_dir: "/vllm-workspace/tests"
+  source_file_dependencies:
+  - vllm/distributed/eplb
+  - tests/distributed/test_eplb_algo.py
+  commands:
+  - pytest -v -s distributed/test_eplb_algo.py
+
+- label: EPLB Execution Test # 5min
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 4
+  source_file_dependencies:
+  - vllm/distributed/eplb
+  - tests/distributed/test_eplb_execute.py
+  commands:
+  - pytest -v -s distributed/test_eplb_execute.py
+
 - label: Metrics, Tracing Test # 10min
   mirror_hardwares: [amdexperimental, amdproduction]
   num_gpus: 2
@@ -271,6 +298,15 @@ steps:
   commands:
   - pytest -v -s prefix_caching
 
+
+- label: Platform Tests (CUDA)
+  mirror_hardwares: [amdexperimental]
+  source_file_dependencies:
+  - vllm/
+  - tests/cuda
+  commands:
+  - pytest -v -s cuda/test_cuda_context.py
+
 - label: Samplers Test # 36min
   mirror_hardwares: [amdexperimental]
   source_file_dependencies:
@@ -500,6 +536,17 @@ steps:
   - pip freeze | grep -E 'torch'
   - pytest -v -s models/language -m core_model
 
+- label: Language Models Test (Hybrid) # 35 min
+  mirror_hardwares: [amdexperimental]
+  torch_nightly: true
+  source_file_dependencies:
+  - vllm/
+  - tests/models/language/generation
+  commands:
+  # Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile.
+  - pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8'
+  - pytest -v -s models/language/generation -m hybrid_model
+
 - label: Language Models Test (Extended Generation) # 1hr20min
   mirror_hardwares: [amdexperimental]
   optional: true
@@ -509,7 +556,7 @@ steps:
   commands:
   # Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile.
   - pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8'
-  - pytest -v -s models/language/generation -m 'not core_model'
+  - pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)'
 
 - label: Language Models Test (Extended Pooling) # 36min
   mirror_hardwares: [amdexperimental]
@@ -606,13 +653,18 @@ steps:
   - vllm/executor/
   - vllm/model_executor/models/
   - tests/distributed/
+  - tests/examples/offline_inference/data_parallel.py
   commands:
   - # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up)
   - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
+  - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
+  - python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code
   - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py
   - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py
   - # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
   - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
+  - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
+  - python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code
 
 - label: Distributed Tests (2 GPUs) # 40min
   mirror_hardwares: [amdexperimental]
@@ -736,7 +788,7 @@ steps:
   - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt
 
 - label: Weight Loading Multiple GPU Test - Large Models # optional
-  mirror_hardwares: [amdexperimental]
+  mirror_hardwares: [amdexperimental]
   working_dir: "/vllm-workspace/tests"
   num_gpus: 2
   gpu: a100
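The hybrid-model split above is worth noting: the new Language Models Test (Hybrid) step selects `-m hybrid_model`, and the Extended Generation step now excludes that marker, so the two selections partition the suite instead of double-running the hybrid models. A quick way to sanity-check such a split locally (paths and marker names taken from the diff):

```bash
# --collect-only lists which tests each marker expression selects,
# without executing them; the two sets below should be disjoint.
pytest models/language/generation -m 'hybrid_model' --collect-only -q
pytest models/language/generation -m '(not core_model) and (not hybrid_model)' --collect-only -q
```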
