|
2 | 2 |
|
3 | 3 | set -xu
|
4 | 4 |
|
5 |
| -# Build the docker image. |
6 |
| -docker build -f docker/Dockerfile.tpu -t vllm-tpu . |
7 | 5 |
|
8 |
| -# Set up cleanup. |
9 |
| -remove_docker_container() { docker rm -f tpu-test || true; } |
# Force-remove the containers this job may have left behind.
# A failing removal (for example, the container does not exist) is ignored on
# purpose so this can run both before the job and from the EXIT trap.
# NOTE(review): vllm-tpu is also the tag given to the image by docker build;
# docker rm only removes containers, so confirm a container with that name is
# actually expected here.
remove_docker_container() {
  local container
  for container in tpu-test vllm-tpu; do
    docker rm -f "$container" || true
  done
}
| 10 | + |
10 | 11 | trap remove_docker_container EXIT
|
| 12 | + |
11 | 13 | # Remove the container that might not be cleaned up in the previous run.
|
12 | 14 | remove_docker_container
|
13 | 15 |
|
| 16 | +# Build the docker image. |
| 17 | +docker build -f docker/Dockerfile.tpu -t vllm-tpu . |
| 18 | + |
| 19 | +# Free up Docker disk space when disk usage is high. |
#######################################
# Reclaim Docker disk space when the filesystem that holds Docker's root
# directory is fuller than a threshold.
# Arguments:
#   $1 - optional usage threshold in percent (default: 70)
# Outputs:
#   Progress messages on stdout, error and warning messages on stderr.
# Returns:
#   0 after a normal run (cleanup performed, not needed, or skipped because
#   the disk usage could not be read). Exits the script with status 1 when
#   the Docker root directory cannot be determined.
#######################################
cleanup_docker() {
  local threshold=${1:-70}
  local docker_root disk_usage

  # Get Docker's root directory
  docker_root=$(docker info -f '{{.DockerRootDir}}')
  if [ -z "$docker_root" ]; then
    echo "Failed to determine Docker root directory." >&2
    exit 1
  fi
  echo "Docker root directory: $docker_root"

  # Check disk usage of the filesystem where Docker's root directory is located.
  # -P selects the POSIX output format: exactly one line per filesystem, so the
  # capacity column is always field 5 of line 2 even for long device names.
  disk_usage=$(df -P -- "$docker_root" | awk 'NR == 2 { sub(/%/, "", $5); print $5 }')

  # Without a numeric value the comparison below would error out and wrongly
  # report that usage is below the threshold, so skip the cleanup explicitly.
  case "$disk_usage" in
    '' | *[!0-9]*)
      echo "Could not determine disk usage for $docker_root. Skipping Docker cleanup." >&2
      return 0
      ;;
  esac

  if [ "$disk_usage" -gt "$threshold" ]; then
    echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
    # Remove dangling images (those that are not tagged and not used by any container)
    docker image prune -f
    # Remove unused volumes / force the system prune for old images as well.
    docker volume prune -f && docker system prune --force --filter "until=72h" --all
    echo "Docker images and volumes cleanup completed."
  else
    echo "Disk usage is below $threshold%. No cleanup needed."
  fi
}
| 43 | +cleanup_docker |
| 44 | + |
14 | 45 | # For HF_TOKEN.
|
15 | 46 | source /etc/environment
|
16 |
| -# Run a simple end-to-end example. |
| 47 | + |
17 | 48 | docker run --privileged --net host --shm-size=16G -it \
|
18 | 49 | -e "HF_TOKEN=$HF_TOKEN" --name tpu-test \
|
19 |
| - vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git \ |
20 |
| - && python3 -m pip install pytest pytest-asyncio tpu-info \ |
21 |
| - && python3 -m pip install lm_eval[api]==0.4.4 \ |
22 |
| - && export VLLM_XLA_CACHE_PATH= \ |
23 |
| - && export VLLM_USE_V1=1 \ |
24 |
| - && export VLLM_XLA_CHECK_RECOMPILATION=1 \ |
25 |
| - && echo HARDWARE \ |
26 |
| - && tpu-info \ |
27 |
| - && { \ |
28 |
| - echo TEST_0: Running test_perf.py; \ |
29 |
| - python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_perf.py; \ |
30 |
| - echo TEST_0_EXIT_CODE: \$?; \ |
31 |
| - } & \ |
32 |
| - { \ |
33 |
| - echo TEST_1: Running test_compilation.py; \ |
34 |
| - python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_compilation.py; \ |
35 |
| - echo TEST_1_EXIT_CODE: \$?; \ |
36 |
| - } & \ |
37 |
| - { \ |
38 |
| - echo TEST_2: Running test_basic.py; \ |
39 |
| - python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_basic.py; \ |
40 |
| - echo TEST_2_EXIT_CODE: \$?; \ |
41 |
| - } & \ |
42 |
| - { \ |
43 |
| - echo TEST_3: Running test_accuracy.py::test_lm_eval_accuracy_v1_engine; \ |
44 |
| - python3 -m pytest -s -v /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine; \ |
45 |
| - echo TEST_3_EXIT_CODE: \$?; \ |
46 |
| - } & \ |
47 |
| - { \ |
48 |
| - echo TEST_4: Running test_quantization_accuracy.py; \ |
49 |
| - python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_quantization_accuracy.py; \ |
50 |
| - echo TEST_4_EXIT_CODE: \$?; \ |
51 |
| - } & \ |
52 |
| - { \ |
53 |
| - echo TEST_5: Running examples/offline_inference/tpu.py; \ |
54 |
| - python3 /workspace/vllm/examples/offline_inference/tpu.py; \ |
55 |
| - echo TEST_5_EXIT_CODE: \$?; \ |
56 |
| - } & \ |
57 |
| - { \ |
58 |
| - echo TEST_6: Running test_tpu_model_runner.py; \ |
59 |
| - python3 -m pytest -s -v /workspace/vllm/tests/tpu/worker/test_tpu_model_runner.py; \ |
60 |
| - echo TEST_6_EXIT_CODE: \$?; \ |
61 |
| - } & \ |
62 |
| - { \ |
63 |
| - echo TEST_7: Running test_sampler.py; \ |
64 |
| - python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_sampler.py; \ |
65 |
| - echo TEST_7_EXIT_CODE: \$?; \ |
66 |
| - } & \ |
67 |
| - { \ |
68 |
| - echo TEST_8: Running test_topk_topp_sampler.py; \ |
69 |
| - python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_topk_topp_sampler.py; \ |
70 |
| - echo TEST_8_EXIT_CODE: \$?; \ |
71 |
| - } & \ |
72 |
| - { \ |
73 |
| - echo TEST_9: Running test_multimodal.py; \ |
74 |
| - python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_multimodal.py; \ |
75 |
| - echo TEST_9_EXIT_CODE: \$?; \ |
76 |
| - } & \ |
77 |
| - { \ |
78 |
| - echo TEST_10: Running test_pallas.py; \ |
79 |
| - python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py; \ |
80 |
| - echo TEST_10_EXIT_CODE: \$?; \ |
81 |
| - } & \ |
82 |
| - { \ |
83 |
| - echo TEST_11: Running test_struct_output_generate.py; \ |
84 |
| - python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py; \ |
85 |
| - echo TEST_11_EXIT_CODE: \$?; \ |
86 |
| - } & \ |
87 |
| - { \ |
88 |
| - echo TEST_12: Running test_moe_pallas.py; \ |
89 |
| - python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py; \ |
90 |
| - echo TEST_12_EXIT_CODE: \$?; \ |
91 |
| - } & \ |
92 |
| - # Disable the TPU LoRA tests until the feature is activated |
93 |
| - # & { \ |
94 |
| - # echo TEST_13: Running test_moe_pallas.py; \ |
95 |
| - # python3 -m pytest -s -v /workspace/vllm/tests/tpu/lora/; \ |
96 |
| - # echo TEST_13_EXIT_CODE: \$?; \ |
97 |
| - # } & \ |
98 |
| - wait \ |
99 |
| - && echo 'All tests have attempted to run. Check logs for individual test statuses and exit codes.' \ |
100 |
| -" |
| 50 | + vllm-tpu /bin/bash -c ' |
| 51 | +set -e # Exit immediately if a command exits with a non-zero status. |
| 52 | +set -u # Treat unset variables as an error. |
| 53 | +
|
| 54 | +echo "--- Starting script inside Docker container ---" |
| 55 | +
|
| 56 | +# Create results directory |
| 57 | +RESULTS_DIR=$(mktemp -d) |
| 58 | +# If mktemp fails, set -e will cause the script to exit. |
| 59 | +echo "Results will be stored in: $RESULTS_DIR" |
| 60 | +
|
| 61 | +# Install dependencies |
| 62 | +echo "--- Installing Python dependencies ---" |
| 63 | +python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \ |
| 64 | + && python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \ |
| 65 | + && python3 -m pip install --progress-bar off lm_eval[api]==0.4.4 |
| 66 | +echo "--- Python dependencies installed ---" |
| 67 | +export VLLM_USE_V1=1 |
| 68 | +export VLLM_XLA_CHECK_RECOMPILATION=1 |
| 69 | +export VLLM_XLA_CACHE_PATH= |
| 70 | +echo "Using VLLM V1" |
| 71 | +
|
| 72 | +echo "--- Hardware Information ---" |
| 73 | +tpu-info |
| 74 | +echo "--- Starting Tests ---" |
| 75 | +set +e |
| 76 | +overall_script_exit_code=0 |
| 77 | +
|
| 78 | +# --- Test Definitions --- |
| 79 | +# If a test fails, this function will print logs and will not cause the main script to exit. |
# Run a single test command, mirror its output into a per-test log file and
# report whether it passed.
# Globals:   RESULTS_DIR (read) - directory that receives test_<num>.log
# Arguments: $1 - test number, used in the log markers and the log file name
#            $2 - human-readable test name
#            $3 - command line to execute (evaluated with eval)
# Outputs:   test output and status markers on stdout; on failure a report
#            including the full per-test log on stderr
# Returns:   the exit status of the test command
run_test() {
  local test_num=$1
  local test_name=$2
  local test_command=$3
  local log_file="$RESULTS_DIR/test_${test_num}.log"
  local actual_exit_code

  echo "--- TEST_$test_num: Running $test_name ---"

  # Execute the test command through a real pipeline instead of >(tee ...)
  # process substitutions: bash does not wait for those helpers, so the log
  # could still be incomplete when it is appended to or dumped below.
  # stderr is folded into stdout so the log keeps the original output order.
  # The command runs in a pipeline subshell, so it cannot alter this shell.
  eval "$test_command" 2>&1 | tee -a "$log_file"
  actual_exit_code=${PIPESTATUS[0]}

  echo "TEST_${test_num}_COMMAND_EXIT_CODE: $actual_exit_code" # This goes to main log
  echo "TEST_${test_num}_COMMAND_EXIT_CODE: $actual_exit_code" >> "$log_file" # Also to per-test log

  if [ "$actual_exit_code" -ne 0 ]; then
    echo "TEST_$test_num ($test_name) FAILED with exit code $actual_exit_code." >&2
    echo "--- Log for failed TEST_$test_num ($test_name) ---" >&2
    if [ -f "$log_file" ]; then
      cat "$log_file" >&2
    else
      echo "Log file $log_file not found for TEST_$test_num ($test_name)." >&2
    fi
    echo "--- End of log for TEST_$test_num ($test_name) ---" >&2
    return "$actual_exit_code" # Return the failure code
  else
    echo "TEST_$test_num ($test_name) PASSED."
    return 0 # Return success
  fi
}
| 111 | +
|
| 112 | +# Helper function to call run_test and update the overall script exit code |
# Run one test through run_test and fold its result into the overall status.
# Arguments: $1 - test number, $2 - test name, $3 - command line to run
# Globals:   overall_script_exit_code (written) - set to 1 when the test fails
run_and_track_test() {
  local number="$1" label="$2" cmd="$3"

  # run_test already prints the failure report, so only the aggregate status
  # needs to be updated here.
  if ! run_test "$number" "$label" "$cmd"; then
    overall_script_exit_code=1
  fi
}
| 128 | +
|
| 129 | +# --- Actual Test Execution --- |
| 130 | +run_and_track_test 0 "test_perf.py" \ |
| 131 | + "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_perf.py" |
| 132 | +run_and_track_test 1 "test_compilation.py" \ |
| 133 | + "python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_compilation.py" |
| 134 | +run_and_track_test 2 "test_basic.py" \ |
| 135 | + "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_basic.py" |
| 136 | +run_and_track_test 3 "test_accuracy.py::test_lm_eval_accuracy_v1_engine" \ |
| 137 | + "python3 -m pytest -s -v /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine" |
| 138 | +run_and_track_test 4 "test_quantization_accuracy.py" \ |
| 139 | + "python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_quantization_accuracy.py" |
| 140 | +run_and_track_test 5 "examples/offline_inference/tpu.py" \ |
| 141 | + "python3 /workspace/vllm/examples/offline_inference/tpu.py" |
| 142 | +run_and_track_test 6 "test_tpu_model_runner.py" \ |
| 143 | + "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/worker/test_tpu_model_runner.py" |
| 144 | +run_and_track_test 7 "test_sampler.py" \ |
| 145 | + "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_sampler.py" |
| 146 | +run_and_track_test 8 "test_topk_topp_sampler.py" \ |
| 147 | + "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_topk_topp_sampler.py" |
| 148 | +run_and_track_test 9 "test_multimodal.py" \ |
| 149 | + "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_multimodal.py" |
| 150 | +run_and_track_test 10 "test_pallas.py" \ |
| 151 | + "python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py" |
| 152 | +run_and_track_test 11 "test_struct_output_generate.py" \ |
| 153 | + "python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py" |
| 154 | +run_and_track_test 12 "test_moe_pallas.py" \ |
| 155 | + "python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py" |
| 156 | +run_and_track_test 13 "test_lora.py" \ |
| 157 | + "VLLM_XLA_CHECK_RECOMPILATION=0 python3 -m pytest -s -v /workspace/vllm/tests/tpu/lora/test_lora.py" |
| 158 | +
|
| 159 | +# After all tests have been attempted, exit with the overall status. |
| 160 | +if [ "$overall_script_exit_code" -ne 0 ]; then |
| 161 | + echo "--- One or more tests FAILED. Overall script exiting with failure code 1. ---" |
| 162 | +else |
| 163 | + echo "--- All tests have completed and PASSED. Overall script exiting with success code 0. ---" |
| 164 | +fi |
| 165 | +exit "$overall_script_exit_code" |
| 166 | +' # IMPORTANT: This closing single quote ends the script passed to bash -c. The script itself must not contain any single quotes. |
| 167 | + |
| 168 | +# Capture the exit code of the docker run command |
| 169 | +DOCKER_RUN_EXIT_CODE=$? |
101 | 170 |
|
| 171 | +# The trap will run for cleanup. |
| 172 | +# Exit the main script with the Docker run command's exit code. |
| 173 | +if [ "$DOCKER_RUN_EXIT_CODE" -ne 0 ]; then |
| 174 | + echo "Docker run command failed with exit code $DOCKER_RUN_EXIT_CODE." |
| 175 | + exit "$DOCKER_RUN_EXIT_CODE" |
| 176 | +else |
| 177 | + echo "Docker run command completed successfully." |
| 178 | + exit 0 |
| 179 | +fi |
102 | 180 | # TODO: This test fails because it uses RANDOM_SEED sampling
|
103 |
| -# && VLLM_USE_V1=1 pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \ |
| 181 | +# pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \ |
0 commit comments