Skip to content

Commit a064ccb

Browse files
committed
migrate changes
Signed-off-by: Leo Tian <leo.tian@centml.ai>
2 parents ab65e32 + 71baf85 commit a064ccb

File tree

607 files changed

+26888
-7145
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

607 files changed

+26888
-7145
lines changed

.buildkite/release-pipeline.yaml

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
steps:
22
- label: "Build wheel - CUDA 12.8"
3+
id: build-wheel-cuda-12-8
34
agents:
45
queue: cpu_queue_postmerge
56
commands:
@@ -11,6 +12,7 @@ steps:
1112
DOCKER_BUILDKIT: "1"
1213

1314
- label: "Build wheel - CUDA 12.6"
15+
id: build-wheel-cuda-12-6
1416
agents:
1517
queue: cpu_queue_postmerge
1618
commands:
@@ -28,6 +30,7 @@ steps:
2830

2931
- label: "Build wheel - CUDA 11.8"
3032
# depends_on: block-build-cu118-wheel
33+
id: build-wheel-cuda-11-8
3134
agents:
3235
queue: cpu_queue_postmerge
3336
commands:
@@ -44,13 +47,26 @@ steps:
4447

4548
- label: "Build release image"
4649
depends_on: block-release-image-build
50+
id: build-release-image
4751
agents:
4852
queue: cpu_queue_postmerge
4953
commands:
5054
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
5155
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain -f docker/Dockerfile ."
5256
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
5357

58+
- label: "Annotate release workflow"
59+
depends_on:
60+
- build-release-image
61+
- build-wheel-cuda-12-8
62+
- build-wheel-cuda-12-6
63+
- build-wheel-cuda-11-8
64+
id: annotate-release-workflow
65+
agents:
66+
queue: cpu_queue_postmerge
67+
commands:
68+
- "bash .buildkite/scripts/annotate-release.sh"
69+
5470
- label: "Build and publish TPU release image"
5571
depends_on: ~
5672
if: build.env("NIGHTLY") == "1"
@@ -70,9 +86,10 @@ steps:
7086
DOCKER_BUILDKIT: "1"
7187

7288
- input: "Provide Release version here"
89+
id: input-release-version
7390
fields:
7491
- text: "What is the release version?"
75-
key: "release-version"
92+
key: release-version
7693

7794
- block: "Build CPU release image"
7895
key: block-cpu-release-image-build
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
#!/bin/bash

# Annotate the Buildkite release workflow with ready-to-paste commands for
# downloading the release wheels and for pulling/tagging/pushing the release
# Docker image. Requires buildkite-agent and the 'release-version' meta-data
# key (set by the "Provide Release version here" input step).

# -o pipefail: without it a failure of 'buildkite-agent meta-data get' below
# would be masked by the succeeding 'sed' stage, and the script would proceed
# with an empty version instead of failing fast.
set -exo pipefail

# Get release version and strip leading 'v' if present
RELEASE_VERSION=$(buildkite-agent meta-data get release-version | sed 's/^v//')

if [ -z "$RELEASE_VERSION" ]; then
  echo "Error: RELEASE_VERSION is empty. 'release-version' metadata might not be set or is invalid."
  exit 1
fi

# The here-doc is unquoted so ${RELEASE_VERSION} and ${BUILDKITE_COMMIT}
# expand; the backticks are escaped so they are NOT command substitution and
# reach the annotation as literal markdown code fences.
buildkite-agent annotate --style 'info' --context 'release-workflow' << EOF
To download the wheel:
\`\`\`
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux1_x86_64.whl .
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu126/vllm-${RELEASE_VERSION}+cu126-cp38-abi3-manylinux1_x86_64.whl .
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu118/vllm-${RELEASE_VERSION}+cu118-cp38-abi3-manylinux1_x86_64.whl .
\`\`\`

To download and upload the image:

\`\`\`
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT} vllm/vllm-openai
docker tag vllm/vllm-openai vllm/vllm-openai:latest
docker tag vllm/vllm-openai vllm/vllm-openai:v${RELEASE_VERSION}
docker push vllm/vllm-openai:latest
docker push vllm/vllm-openai:v${RELEASE_VERSION}
\`\`\`
EOF

.buildkite/scripts/ci-clean-log.sh

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
#!/bin/bash
# Usage: ./ci_clean_log.sh ci.log
# Normalizes a CI log in place: strips leading ISO-8601 timestamps and ANSI
# color/erase escape sequences so the log is easier to read, diff and search.

# A log file argument is mandatory.
if [[ $# -lt 1 ]]; then
    echo "Usage: $0 ci.log"
    exit 1
fi

log_file="$1"

# Drop leading "[YYYY-MM-DDTHH:MM:SSZ] " timestamp prefixes.
sed -i 's/^\[[0-9]\{4\}-[0-9]\{2\}-[0-9]\{2\}T[0-9]\{2\}:[0-9]\{2\}:[0-9]\{2\}Z\] //' "$log_file"

# Drop ANSI escape sequences of the form ESC [ ... m (color) or ... K (erase).
sed -i -r 's/\x1B\[[0-9;]*[mK]//g' "$log_file"

.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ set -ex
77
# Setup cleanup
88
remove_docker_container() {
99
if [[ -n "$container_id" ]]; then
10+
podman stop --all -t0
1011
podman rm -f "$container_id" || true
1112
fi
1213
podman system prune -f
@@ -37,7 +38,7 @@ function cpu_tests() {
3738
pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-facebook/opt-125m]
3839
pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-google/gemma-1.1-2b-it]
3940
pytest -v -s tests/models/language/pooling/test_classification.py::test_models[float-jason9693/Qwen2.5-1.5B-apeach]
40-
pytest -v -s tests/models/language/pooling/test_embedding.py::test_models[half-BAAI/bge-base-en-v1.5]"
41+
pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model"
4142
}
4243

4344
# All of CPU tests are expected to be finished less than 40 mins.

.buildkite/scripts/hardware_ci/run-cpu-test.sh

Lines changed: 19 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ set -ex
66

77
# allow to bind to different cores
88
CORE_RANGE=${CORE_RANGE:-48-95}
9+
OMP_CORE_RANGE=${OMP_CORE_RANGE:-48-95}
910
NUMA_NODE=${NUMA_NODE:-1}
1011

1112
export CMAKE_BUILD_PARALLEL_LEVEL=32
@@ -23,15 +24,22 @@ numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$NUMA_NODE
2324
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$NUMA_NODE"-avx2 --target vllm-test -f docker/Dockerfile.cpu .
2425

2526
# Run the image, setting --shm-size=4g for tensor parallel.
26-
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
27-
--cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
28-
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
29-
--cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2
27+
docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_OMP_THREADS_BIND="$OMP_CORE_RANGE" --env VLLM_CPU_CI_ENV=1 --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
28+
docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_OMP_THREADS_BIND="$OMP_CORE_RANGE" --env VLLM_CPU_CI_ENV=1 --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2
3029

3130
function cpu_tests() {
3231
set -e
3332
export NUMA_NODE=$2
3433

34+
# list packages
35+
docker exec cpu-test-"$NUMA_NODE"-avx2 bash -c "
36+
set -e
37+
pip list"
38+
39+
docker exec cpu-test-"$NUMA_NODE" bash -c "
40+
set -e
41+
pip list"
42+
3543
# offline inference
3644
docker exec cpu-test-"$NUMA_NODE"-avx2 bash -c "
3745
set -e
@@ -44,7 +52,10 @@ function cpu_tests() {
4452
pytest -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model
4553
pytest -v -s tests/models/language/generation -m cpu_model
4654
pytest -v -s tests/models/language/pooling -m cpu_model
47-
pytest -v -s tests/models/multimodal/generation --ignore=tests/models/multimodal/generation/test_mllama.py -m cpu_model"
55+
pytest -v -s tests/models/multimodal/generation \
56+
--ignore=tests/models/multimodal/generation/test_mllama.py \
57+
--ignore=tests/models/multimodal/generation/test_pixtral.py \
58+
-m cpu_model"
4859

4960
# Run compressed-tensor test
5061
docker exec cpu-test-"$NUMA_NODE" bash -c "
@@ -56,7 +67,7 @@ function cpu_tests() {
5667
# Run AWQ test
5768
docker exec cpu-test-"$NUMA_NODE" bash -c "
5869
set -e
59-
pytest -s -v \
70+
VLLM_USE_V1=0 pytest -s -v \
6071
tests/quantization/test_ipex_quant.py"
6172

6273
# Run chunked-prefill and prefix-cache test
@@ -68,11 +79,9 @@ function cpu_tests() {
6879
# online serving
6980
docker exec cpu-test-"$NUMA_NODE" bash -c "
7081
set -e
71-
export VLLM_CPU_KVCACHE_SPACE=10
72-
export VLLM_CPU_OMP_THREADS_BIND=$1
7382
python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m --dtype half &
7483
timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
75-
python3 benchmarks/benchmark_serving.py \
84+
VLLM_CPU_CI_ENV=0 python3 benchmarks/benchmark_serving.py \
7685
--backend vllm \
7786
--dataset-name random \
7887
--model facebook/opt-125m \
@@ -89,4 +98,4 @@ function cpu_tests() {
8998

9099
# All of CPU tests are expected to be finished less than 40 mins.
91100
export -f cpu_tests
92-
timeout 40m bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
101+
timeout 1h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"

.buildkite/scripts/hardware_ci/run-neuron-test.sh

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -54,10 +54,11 @@ docker run --rm -it --device=/dev/neuron0 --network bridge \
5454
--name "${container_name}" \
5555
${image_name} \
5656
/bin/bash -c "
57+
set -e; # Exit on first error
5758
python3 /workspace/vllm/examples/offline_inference/neuron.py;
5859
python3 -m pytest /workspace/vllm/tests/neuron/1_core/ -v --capture=tee-sys;
5960
for f in /workspace/vllm/tests/neuron/2_core/*.py; do
60-
echo 'Running test file: '$f;
61+
echo \"Running test file: \$f\";
6162
python3 -m pytest \$f -v --capture=tee-sys;
6263
done
6364
"

.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -150,7 +150,7 @@ run_and_track_test 9 "test_multimodal.py" \
150150
run_and_track_test 10 "test_pallas.py" \
151151
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py"
152152
run_and_track_test 11 "test_struct_output_generate.py" \
153-
"python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py"
153+
"python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py -k \"not test_structured_output_with_reasoning_matrices\""
154154
run_and_track_test 12 "test_moe_pallas.py" \
155155
"python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py"
156156
run_and_track_test 13 "test_lora.py" \

.buildkite/scripts/rerun-test.sh

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
#!/bin/bash

# Usage: ./rerun_test.sh path/to/test.py::test_name
# Runs a single pytest target over and over until it fails, printing the
# number of the run about to start. Handy for reproducing flaky tests.

# A test selector argument is mandatory.
if [[ $# -lt 1 ]]; then
    echo "Usage: $0 path/to/test.py::test_name"
    echo "Example: $0 tests/v1/engine/test_engine_core_client.py::test_kv_cache_events[True-tcp]"
    exit 1
fi

test_selector=$1
run_number=1

# Keep looping while the test passes; the first failure ends the loop.
while pytest -sv "$test_selector"; do
    run_number=$((run_number + 1))
    printf 'RUN NUMBER %s\n' "$run_number"
done
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
#!/bin/bash

set -euo pipefail

# Reclaim disk space on the CI host: when the filesystem that holds Docker's
# data directory is over a usage threshold, prune images and volumes.

docker_root=$(docker info -f '{{.DockerRootDir}}')
if [[ -z "$docker_root" ]]; then
  echo "Failed to determine Docker root directory."
  exit 1
fi
echo "Docker root directory: $docker_root"

# Usage percentage (the '%' stripped) of the filesystem backing $docker_root.
disk_usage=$(df "$docker_root" | tail -1 | awk '{ sub(/%/, "", $5); print $5 }')

# Prune only when usage is above this percentage.
readonly threshold=70

if [[ "$disk_usage" -gt "$threshold" ]]; then
  echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
  # First drop dangling images (untagged and unreferenced by any container).
  docker image prune -f
  # Then unused volumes, plus a full prune of anything older than 72 hours.
  docker volume prune -f && docker system prune --force --filter "until=72h" --all
  echo "Docker images and volumes cleanup completed."
else
  echo "Disk usage is below $threshold%. No cleanup needed."
fi
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
# Environment config
# Identifier for this benchmark configuration.
TEST_NAME=llama8b
# Container name the benchmark harness uses (presumably the vLLM TPU
# container — confirm against the consuming script).
CONTAINER_NAME=vllm-tpu

# vllm config
# Model to serve.
MODEL=meta-llama/Llama-3.1-8B-Instruct
# Scheduler limits: concurrent sequences and batched tokens per step.
MAX_NUM_SEQS=512
MAX_NUM_BATCHED_TOKENS=512
TENSOR_PARALLEL_SIZE=1
MAX_MODEL_LEN=2048
# Persistent disk used as the model download cache.
DOWNLOAD_DIR=/mnt/disks/persist
# Minimum throughput the run must reach to pass.
EXPECTED_THROUGHPUT=8.0
# Benchmark request shape: prompt and completion lengths in tokens.
INPUT_LEN=1800
OUTPUT_LEN=128

0 commit comments

Comments
 (0)