Skip to content

Commit a064ccb

Browse files
committed
migrate changes
Signed-off-by: Leo Tian <leo.tian@centml.ai>
2 parents ab65e32 + 71baf85 commit a064ccb

File tree

607 files changed

+26888
-7145
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

607 files changed

+26888
-7145
lines changed

.buildkite/release-pipeline.yaml

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
steps:
22
- label: "Build wheel - CUDA 12.8"
3+
id: build-wheel-cuda-12-8
34
agents:
45
queue: cpu_queue_postmerge
56
commands:
@@ -11,6 +12,7 @@ steps:
1112
DOCKER_BUILDKIT: "1"
1213

1314
- label: "Build wheel - CUDA 12.6"
15+
id: build-wheel-cuda-12-6
1416
agents:
1517
queue: cpu_queue_postmerge
1618
commands:
@@ -28,6 +30,7 @@ steps:
2830

2931
- label: "Build wheel - CUDA 11.8"
3032
# depends_on: block-build-cu118-wheel
33+
id: build-wheel-cuda-11-8
3134
agents:
3235
queue: cpu_queue_postmerge
3336
commands:
@@ -44,13 +47,26 @@ steps:
4447

4548
- label: "Build release image"
4649
depends_on: block-release-image-build
50+
id: build-release-image
4751
agents:
4852
queue: cpu_queue_postmerge
4953
commands:
5054
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
5155
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain -f docker/Dockerfile ."
5256
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
5357

58+
- label: "Annotate release workflow"
59+
depends_on:
60+
- build-release-image
61+
- build-wheel-cuda-12-8
62+
- build-wheel-cuda-12-6
63+
- build-wheel-cuda-11-8
64+
id: annotate-release-workflow
65+
agents:
66+
queue: cpu_queue_postmerge
67+
commands:
68+
- "bash .buildkite/scripts/annotate-release.sh"
69+
5470
- label: "Build and publish TPU release image"
5571
depends_on: ~
5672
if: build.env("NIGHTLY") == "1"
@@ -70,9 +86,10 @@ steps:
7086
DOCKER_BUILDKIT: "1"
7187

7288
- input: "Provide Release version here"
89+
id: input-release-version
7390
fields:
7491
- text: "What is the release version?"
75-
key: "release-version"
92+
key: release-version
7693

7794
- block: "Build CPU release image"
7895
key: block-cpu-release-image-build
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
#!/bin/bash

# Annotate the Buildkite release workflow with ready-to-paste commands for
# downloading the release wheels and for pulling/tagging/pushing the release
# Docker image. Requires buildkite-agent and the 'release-version' meta-data
# key (set by the "Provide Release version here" input step).

# -o pipefail: without it a failure of 'buildkite-agent meta-data get' below
# would be masked by the succeeding 'sed' stage, and the script would proceed
# with an empty version instead of failing fast.
set -exo pipefail

# Get release version and strip leading 'v' if present
RELEASE_VERSION=$(buildkite-agent meta-data get release-version | sed 's/^v//')

if [ -z "$RELEASE_VERSION" ]; then
  echo "Error: RELEASE_VERSION is empty. 'release-version' metadata might not be set or is invalid."
  exit 1
fi

# The here-doc is unquoted so ${RELEASE_VERSION} and ${BUILDKITE_COMMIT}
# expand; the backticks are escaped so they are NOT command substitution and
# reach the annotation as literal markdown code fences.
buildkite-agent annotate --style 'info' --context 'release-workflow' << EOF
To download the wheel:
\`\`\`
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux1_x86_64.whl .
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu126/vllm-${RELEASE_VERSION}+cu126-cp38-abi3-manylinux1_x86_64.whl .
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu118/vllm-${RELEASE_VERSION}+cu118-cp38-abi3-manylinux1_x86_64.whl .
\`\`\`

To download and upload the image:

\`\`\`
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT} vllm/vllm-openai
docker tag vllm/vllm-openai vllm/vllm-openai:latest
docker tag vllm/vllm-openai vllm/vllm-openai:v${RELEASE_VERSION}
docker push vllm/vllm-openai:latest
docker push vllm/vllm-openai:v${RELEASE_VERSION}
\`\`\`
EOF

.buildkite/scripts/ci-clean-log.sh

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
#!/bin/bash
# Usage: ./ci_clean_log.sh ci.log
# Normalizes a CI log in place: strips leading ISO-8601 timestamps and ANSI
# color/erase escape sequences so the log is easier to read, diff and search.

# A log file argument is mandatory.
if [[ $# -lt 1 ]]; then
    echo "Usage: $0 ci.log"
    exit 1
fi

log_file="$1"

# Drop leading "[YYYY-MM-DDTHH:MM:SSZ] " timestamp prefixes.
sed -i 's/^\[[0-9]\{4\}-[0-9]\{2\}-[0-9]\{2\}T[0-9]\{2\}:[0-9]\{2\}:[0-9]\{2\}Z\] //' "$log_file"

# Drop ANSI escape sequences of the form ESC [ ... m (color) or ... K (erase).
sed -i -r 's/\x1B\[[0-9;]*[mK]//g' "$log_file"

.buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ set -ex
77
# Setup cleanup
88
remove_docker_container() {
99
if [[ -n "$container_id" ]]; then
10+
podman stop --all -t0
1011
podman rm -f "$container_id" || true
1112
fi
1213
podman system prune -f
@@ -37,7 +38,7 @@ function cpu_tests() {
3738
pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-facebook/opt-125m]
3839
pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-google/gemma-1.1-2b-it]
3940
pytest -v -s tests/models/language/pooling/test_classification.py::test_models[float-jason9693/Qwen2.5-1.5B-apeach]
40-
pytest -v -s tests/models/language/pooling/test_embedding.py::test_models[half-BAAI/bge-base-en-v1.5]"
41+
pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model"
4142
}
4243

4344
# All of CPU tests are expected to be finished less than 40 mins.

.buildkite/scripts/hardware_ci/run-cpu-test.sh

Lines changed: 19 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ set -ex
66

77
# allow to bind to different cores
88
CORE_RANGE=${CORE_RANGE:-48-95}
9+
OMP_CORE_RANGE=${OMP_CORE_RANGE:-48-95}
910
NUMA_NODE=${NUMA_NODE:-1}
1011

1112
export CMAKE_BUILD_PARALLEL_LEVEL=32
@@ -23,15 +24,22 @@ numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$NUMA_NODE
2324
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$NUMA_NODE"-avx2 --target vllm-test -f docker/Dockerfile.cpu .
2425

2526
# Run the image, setting --shm-size=4g for tensor parallel.
26-
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
27-
--cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
28-
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
29-
--cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2
27+
docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_OMP_THREADS_BIND="$OMP_CORE_RANGE" --env VLLM_CPU_CI_ENV=1 --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
28+
docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_OMP_THREADS_BIND="$OMP_CORE_RANGE" --env VLLM_CPU_CI_ENV=1 --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2
3029

3130
function cpu_tests() {
3231
set -e
3332
export NUMA_NODE=$2
3433

34+
# list packages
35+
docker exec cpu-test-"$NUMA_NODE"-avx2 bash -c "
36+
set -e
37+
pip list"
38+
39+
docker exec cpu-test-"$NUMA_NODE" bash -c "
40+
set -e
41+
pip list"
42+
3543
# offline inference
3644
docker exec cpu-test-"$NUMA_NODE"-avx2 bash -c "
3745
set -e
@@ -44,7 +52,10 @@ function cpu_tests() {
4452
pytest -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model
4553
pytest -v -s tests/models/language/generation -m cpu_model
4654
pytest -v -s tests/models/language/pooling -m cpu_model
47-
pytest -v -s tests/models/multimodal/generation --ignore=tests/models/multimodal/generation/test_mllama.py -m cpu_model"
55+
pytest -v -s tests/models/multimodal/generation \
56+
--ignore=tests/models/multimodal/generation/test_mllama.py \
57+
--ignore=tests/models/multimodal/generation/test_pixtral.py \
58+
-m cpu_model"
4859

4960
# Run compressed-tensor test
5061
docker exec cpu-test-"$NUMA_NODE" bash -c "
@@ -56,7 +67,7 @@ function cpu_tests() {
5667
# Run AWQ test
5768
docker exec cpu-test-"$NUMA_NODE" bash -c "
5869
set -e
59-
pytest -s -v \
70+
VLLM_USE_V1=0 pytest -s -v \
6071
tests/quantization/test_ipex_quant.py"
6172

6273
# Run chunked-prefill and prefix-cache test
@@ -68,11 +79,9 @@ function cpu_tests() {
6879
# online serving
6980
docker exec cpu-test-"$NUMA_NODE" bash -c "
7081
set -e
71-
export VLLM_CPU_KVCACHE_SPACE=10
72-
export VLLM_CPU_OMP_THREADS_BIND=$1
7382
python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m --dtype half &
7483
timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
75-
python3 benchmarks/benchmark_serving.py \
84+
VLLM_CPU_CI_ENV=0 python3 benchmarks/benchmark_serving.py \
7685
--backend vllm \
7786
--dataset-name random \
7887
--model facebook/opt-125m \
@@ -89,4 +98,4 @@ function cpu_tests() {
8998

9099
# All of CPU tests are expected to be finished less than 40 mins.
91100
export -f cpu_tests
92-
timeout 40m bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
101+
timeout 1h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"

.buildkite/scripts/hardware_ci/run-neuron-test.sh

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -54,10 +54,11 @@ docker run --rm -it --device=/dev/neuron0 --network bridge \
5454
--name "${container_name}" \
5555
${image_name} \
5656
/bin/bash -c "
57+
set -e; # Exit on first error
5758
python3 /workspace/vllm/examples/offline_inference/neuron.py;
5859
python3 -m pytest /workspace/vllm/tests/neuron/1_core/ -v --capture=tee-sys;
5960
for f in /workspace/vllm/tests/neuron/2_core/*.py; do
60-
echo 'Running test file: '$f;
61+
echo \"Running test file: \$f\";
6162
python3 -m pytest \$f -v --capture=tee-sys;
6263
done
6364
"

.buildkite/scripts/hardware_ci/run-tpu-v1-test.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -150,7 +150,7 @@ run_and_track_test 9 "test_multimodal.py" \
150150
run_and_track_test 10 "test_pallas.py" \
151151
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py"
152152
run_and_track_test 11 "test_struct_output_generate.py" \
153-
"python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py"
153+
"python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py -k \"not test_structured_output_with_reasoning_matrices\""
154154
run_and_track_test 12 "test_moe_pallas.py" \
155155
"python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py"
156156
run_and_track_test 13 "test_lora.py" \

.buildkite/scripts/rerun-test.sh

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
#!/bin/bash

# Usage: ./rerun_test.sh path/to/test.py::test_name
# Runs a single pytest target over and over until it fails, printing the
# number of the run about to start. Handy for reproducing flaky tests.

# A test selector argument is mandatory.
if [[ $# -lt 1 ]]; then
    echo "Usage: $0 path/to/test.py::test_name"
    echo "Example: $0 tests/v1/engine/test_engine_core_client.py::test_kv_cache_events[True-tcp]"
    exit 1
fi

test_selector=$1
run_number=1

# Keep looping while the test passes; the first failure ends the loop.
while pytest -sv "$test_selector"; do
    run_number=$((run_number + 1))
    printf 'RUN NUMBER %s\n' "$run_number"
done
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
#!/bin/bash

set -euo pipefail

# Reclaim disk space on the CI host: when the filesystem that holds Docker's
# data directory is over a usage threshold, prune images and volumes.

docker_root=$(docker info -f '{{.DockerRootDir}}')
if [[ -z "$docker_root" ]]; then
  echo "Failed to determine Docker root directory."
  exit 1
fi
echo "Docker root directory: $docker_root"

# Usage percentage (the '%' stripped) of the filesystem backing $docker_root.
disk_usage=$(df "$docker_root" | tail -1 | awk '{ sub(/%/, "", $5); print $5 }')

# Prune only when usage is above this percentage.
readonly threshold=70

if [[ "$disk_usage" -gt "$threshold" ]]; then
  echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
  # First drop dangling images (untagged and unreferenced by any container).
  docker image prune -f
  # Then unused volumes, plus a full prune of anything older than 72 hours.
  docker volume prune -f && docker system prune --force --filter "until=72h" --all
  echo "Docker images and volumes cleanup completed."
else
  echo "Disk usage is below $threshold%. No cleanup needed."
fi
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
# Environment config
# Identifier for this benchmark configuration.
TEST_NAME=llama8b
# Container name the benchmark harness uses (presumably the vLLM TPU
# container — confirm against the consuming script).
CONTAINER_NAME=vllm-tpu

# vllm config
# Model to serve.
MODEL=meta-llama/Llama-3.1-8B-Instruct
# Scheduler limits: concurrent sequences and batched tokens per step.
MAX_NUM_SEQS=512
MAX_NUM_BATCHED_TOKENS=512
TENSOR_PARALLEL_SIZE=1
MAX_MODEL_LEN=2048
# Persistent disk used as the model download cache.
DOWNLOAD_DIR=/mnt/disks/persist
# Minimum throughput the run must reach to pass.
EXPECTED_THROUGHPUT=8.0
# Benchmark request shape: prompt and completion lengths in tokens.
INPUT_LEN=1800
OUTPUT_LEN=128

0 commit comments

Comments
 (0)