Skip to content

Commit ee5ad8d

Browse files
authored
[Misc][Tools][Benchmark] Add profile to autotune script (#19711)
Signed-off-by: Chenyaaang <chenyangli@google.com>
1 parent a738dbb commit ee5ad8d

File tree

1 file changed

+37
-5
lines changed

1 file changed

+37
-5
lines changed

benchmarks/auto_tune.sh

Lines changed: 37 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
# 3. Set variables (ALL REQUIRED)
1111
# BASE: your directory for vllm repo
1212
# MODEL: the model served by vllm
13+
# SYSTEM: the hardware, either TPU or GPU; for other systems, "get best profile" might not be supported.
1314
# TP: ways of tensor parallelism
1415
# DOWNLOAD_DIR: directory to download and load model weights.
1516
# INPUT_LEN: request input len
@@ -34,6 +35,7 @@
3435
TAG=$(date +"%Y_%m_%d_%H_%M")
3536
BASE=""
3637
MODEL="meta-llama/Llama-3.1-8B-Instruct"
38+
SYSTEM="TPU"
3739
TP=1
3840
DOWNLOAD_DIR=""
3941
INPUT_LEN=4000
@@ -45,12 +47,15 @@ NUM_BATCHED_TOKENS_LIST="512 1024 2048 4096"
4547

4648
LOG_FOLDER="$BASE/auto-benchmark/$TAG"
4749
RESULT="$LOG_FOLDER/result.txt"
50+
PROFILE_PATH="$LOG_FOLDER/profile"
4851

4952
echo "result file: $RESULT"
5053
echo "model: $MODEL"
5154

5255
rm -rf $LOG_FOLDER
56+
rm -rf $PROFILE_PATH
5357
mkdir -p $LOG_FOLDER
58+
mkdir -p $PROFILE_PATH
5459

5560
cd "$BASE/vllm"
5661

@@ -70,10 +75,11 @@ start_server() {
7075
local max_num_seqs=$2
7176
local max_num_batched_tokens=$3
7277
local vllm_log=$4
78+
local profile_dir=$5
7379

7480
pkill -f vllm
7581

76-
VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 vllm serve $MODEL \
82+
VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 VLLM_TORCH_PROFILER_DIR=$profile_dir vllm serve $MODEL \
7783
--disable-log-requests \
7884
--port 8004 \
7985
--gpu-memory-utilization $gpu_memory_utilization \
@@ -105,19 +111,37 @@ start_server() {
105111
fi
106112
}
107113

114+
update_best_profile() {
115+
local profile_dir=$1
116+
local profile_index=$2
117+
sorted_paths=($(find "$profile_dir" -maxdepth 1 -not -path "$profile_dir" | sort))
118+
selected_profile_file=
119+
if [[ "$SYSTEM" == "TPU" ]]; then
120+
selected_profile_file="${sorted_paths[$profile_index]}/*.xplane.pb"
121+
fi
122+
if [[ "$SYSTEM" == "GPU" ]]; then
123+
selected_profile_file="${sorted_paths[$profile_index]}"
124+
fi
125+
rm -f $PROFILE_PATH/*
126+
cp $selected_profile_file $PROFILE_PATH
127+
}
128+
108129
run_benchmark() {
109130
local max_num_seqs=$1
110131
local max_num_batched_tokens=$2
111132
local gpu_memory_utilization=$3
112133
echo "max_num_seq: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens"
113134
local vllm_log="$LOG_FOLDER/vllm_log_${max_num_seqs}_${max_num_batched_tokens}.txt"
135+
local profile_dir="$LOG_FOLDER/profile_${max_num_seqs}_${max_num_batched_tokens}"
114136
echo "vllm_log: $vllm_log"
115137
echo
116138
rm -f $vllm_log
139+
mkdir -p $profile_dir
117140
pkill -f vllm
141+
local profile_index=0
118142

119143
echo "starting server..."
120-
start_server $gpu_memory_utilization $max_num_seqs $max_num_batched_tokens $vllm_log
144+
start_server $gpu_memory_utilization $max_num_seqs $max_num_batched_tokens $vllm_log $profile_dir
121145
result=$?
122146
if [[ "$result" -eq 1 ]]; then
123147
echo "server failed to start. gpu_memory_utilization:$gpu_memory_utilization, max_num_seqs:$max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens"
@@ -144,7 +168,8 @@ run_benchmark() {
144168
--goodput e2el:$MAX_LATENCY_ALLOWED_MS \
145169
--num-prompts 1000 \
146170
--random-prefix-len $prefix_len \
147-
--port 8004 &> "$bm_log"
171+
--port 8004 \
172+
--profile &> "$bm_log"
148173
throughput=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
149174
e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}')
150175
goodput=$(grep "Request goodput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
@@ -158,6 +183,7 @@ run_benchmark() {
158183
# start from request-rate as int(throughput) + 1
159184
request_rate=$((${throughput%.*} + 1))
160185
while ((request_rate > 0)); do
186+
profile_index=$((profile_index+1))
161187
# clear prefix cache
162188
curl -X POST http://0.0.0.0:8004/reset_prefix_cache
163189
sleep 5
@@ -195,6 +221,12 @@ run_benchmark() {
195221
best_max_num_seqs=$max_num_seqs
196222
best_num_batched_tokens=$max_num_batched_tokens
197223
best_goodput=$goodput
224+
if [[ "$SYSTEM" == "TPU" ]]; then
225+
update_best_profile "$profile_dir/plugins/profile" $profile_index
226+
fi
227+
if [[ "$SYSTEM" == "GPU" ]]; then
228+
update_best_profile "$profile_dir" $profile_index
229+
fi
198230
fi
199231
else
200232
echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens does not meet latency requirement ${MAX_LATENCY_ALLOWED_MS}"
@@ -239,6 +271,6 @@ for num_seqs in "${num_seqs_list[@]}"; do
239271
done
240272
done
241273
echo "finish permutations"
242-
echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput"
243-
echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput" >> "$RESULT"
274+
echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput, profile saved in: $PROFILE_PATH"
275+
echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput, profile saved in: $PROFILE_PATH" >> "$RESULT"
244276

0 commit comments

Comments
 (0)