# 3. Set variables (ALL REQUIRED)
# BASE: your directory for the vllm repo
# MODEL: the model served by vllm
# SYSTEM: the hardware, either TPU or GPU; "get best profile" may not support other systems
# TP: ways of tensor parallelism
# DOWNLOAD_DIR: directory to download and load model weights
# INPUT_LEN: request input length
# Benchmark configuration — fill in BASE and DOWNLOAD_DIR before running.
TAG=$(date +"%Y_%m_%d_%H_%M")              # timestamp naming this run's output folder
BASE=""                                    # your directory for the vllm repo
MODEL="meta-llama/Llama-3.1-8B-Instruct"   # the model served by vllm
SYSTEM="TPU"                               # hardware: "TPU" or "GPU"
TP=1                                       # tensor-parallel degree
DOWNLOAD_DIR=""                            # directory to download and load model weights
INPUT_LEN=4000                             # request input length
# Output locations: everything for this run lives under a timestamped folder.
LOG_FOLDER="$BASE/auto-benchmark/$TAG"
RESULT="$LOG_FOLDER/result.txt"
PROFILE_PATH="$LOG_FOLDER/profile"   # the best run's profile is copied here

echo "result file: $RESULT"
echo "model: $MODEL"

# Start from a clean slate for this tag. PROFILE_PATH lives inside
# LOG_FOLDER, so one recursive delete clears both; mkdir -p of the
# deepest path recreates both. Quoted to survive spaces in BASE.
rm -rf -- "$LOG_FOLDER"
mkdir -p "$PROFILE_PATH"

# Abort if the repo checkout is missing — the rest of the script
# assumes it is running from inside the vllm tree.
cd "$BASE/vllm" || { echo "cannot cd to $BASE/vllm" >&2; exit 1; }
@@ -70,10 +75,11 @@ start_server() {
70
75
local max_num_seqs=$2
71
76
local max_num_batched_tokens=$3
72
77
local vllm_log=$4
78
+ local profile_dir=$5
73
79
74
80
pkill -f vllm
75
81
76
- VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 vllm serve $MODEL \
82
+ VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 VLLM_TORCH_PROFILER_DIR= $profile_dir vllm serve $MODEL \
77
83
--disable-log-requests \
78
84
--port 8004 \
79
85
--gpu-memory-utilization $gpu_memory_utilization \
@@ -105,19 +111,37 @@ start_server() {
105
111
fi
106
112
}
107
113
114
#######################################
# Save the profile produced by the best run so far.
# Globals:   SYSTEM (read: "TPU" or "GPU"), PROFILE_PATH (read; its contents
#            are replaced by the selected profile)
# Arguments: $1 - directory holding one entry per profiled run
#            $2 - index (into the sorted entries) of the run to keep
#######################################
update_best_profile() {
  local profile_dir=$1
  local profile_index=$2
  # Locals so repeated calls don't leak scratch state into the global scope.
  local sorted_paths selected_profile_file
  # One entry per profiled run, in a stable (sorted) order so the index
  # passed by the caller lines up with profiling order.
  mapfile -t sorted_paths < <(find "$profile_dir" -maxdepth 1 -not -path "$profile_dir" | sort)
  selected_profile_file=
  if [[ "$SYSTEM" == "TPU" ]]; then
    # TPU runs write an xplane file inside the run directory; this value is
    # deliberately left unquoted at the cp below so the glob expands.
    selected_profile_file="${sorted_paths[$profile_index]}/*.xplane.pb"
  elif [[ "$SYSTEM" == "GPU" ]]; then
    # GPU runs write the trace file directly as the entry itself.
    selected_profile_file="${sorted_paths[$profile_index]}"
  fi
  # Replace any previously-saved best profile.
  rm -f "$PROFILE_PATH"/*
  cp $selected_profile_file "$PROFILE_PATH"
}
128
+
108
129
run_benchmark () {
109
130
local max_num_seqs=$1
110
131
local max_num_batched_tokens=$2
111
132
local gpu_memory_utilization=$3
112
133
echo " max_num_seq: $max_num_seqs , max_num_batched_tokens: $max_num_batched_tokens "
113
134
local vllm_log=" $LOG_FOLDER /vllm_log_${max_num_seqs} _${max_num_batched_tokens} .txt"
135
+ local profile_dir=" $LOG_FOLDER /profile_${max_num_seqs} _${max_num_batched_tokens} "
114
136
echo " vllm_log: $vllm_log "
115
137
echo
116
138
rm -f $vllm_log
139
+ mkdir -p $profile_dir
117
140
pkill -f vllm
141
+ local profile_index=0
118
142
119
143
echo " starting server..."
120
- start_server $gpu_memory_utilization $max_num_seqs $max_num_batched_tokens $vllm_log
144
+ start_server $gpu_memory_utilization $max_num_seqs $max_num_batched_tokens $vllm_log $profile_dir
121
145
result=$?
122
146
if [[ " $result " -eq 1 ]]; then
123
147
echo " server failed to start. gpu_memory_utilization:$gpu_memory_utilization , max_num_seqs:$max_num_seqs , max_num_batched_tokens: $max_num_batched_tokens "
@@ -144,7 +168,8 @@ run_benchmark() {
144
168
--goodput e2el:$MAX_LATENCY_ALLOWED_MS \
145
169
--num-prompts 1000 \
146
170
--random-prefix-len $prefix_len \
147
- --port 8004 & > " $bm_log "
171
+ --port 8004 \
172
+ --profile & > " $bm_log "
148
173
throughput=$( grep " Request throughput (req/s):" " $bm_log " | sed ' s/[^0-9.]//g' )
149
174
e2el=$( grep " P99 E2EL (ms):" " $bm_log " | awk ' {print $NF}' )
150
175
goodput=$( grep " Request goodput (req/s):" " $bm_log " | sed ' s/[^0-9.]//g' )
@@ -158,6 +183,7 @@ run_benchmark() {
158
183
# start from request-rate as int(throughput) + 1
159
184
request_rate=$(( ${throughput% .* } + 1 ))
160
185
while (( request_rate > 0 )) ; do
186
+ profile_index=$(( profile_index+ 1 ))
161
187
# clear prefix cache
162
188
curl -X POST http://0.0.0.0:8004/reset_prefix_cache
163
189
sleep 5
@@ -195,6 +221,12 @@ run_benchmark() {
195
221
best_max_num_seqs=$max_num_seqs
196
222
best_num_batched_tokens=$max_num_batched_tokens
197
223
best_goodput=$goodput
224
+ if [[ " $SYSTEM " == " TPU" ]]; then
225
+ update_best_profile " $profile_dir /plugins/profile" $profile_index
226
+ fi
227
+ if [[ " $SYSTEM " == " GPU" ]]; then
228
+ update_best_profile " $profile_dir " $profile_index
229
+ fi
198
230
fi
199
231
else
200
232
echo " max_num_seqs: $max_num_seqs , max_num_batched_tokens: $max_num_batched_tokens does not meet latency requirement ${MAX_LATENCY_ALLOWED_MS} "
@@ -239,6 +271,6 @@ for num_seqs in "${num_seqs_list[@]}"; do
239
271
done
240
272
done
241
273
echo " finish permutations"
242
- echo " best_max_num_seqs: $best_max_num_seqs , best_num_batched_tokens: $best_num_batched_tokens , best_throughput: $best_throughput "
243
- echo " best_max_num_seqs: $best_max_num_seqs , best_num_batched_tokens: $best_num_batched_tokens , best_throughput: $best_throughput " >> " $RESULT "
274
+ echo " best_max_num_seqs: $best_max_num_seqs , best_num_batched_tokens: $best_num_batched_tokens , best_throughput: $best_throughput , profile saved in: $PROFILE_PATH "
275
+ echo " best_max_num_seqs: $best_max_num_seqs , best_num_batched_tokens: $best_num_batched_tokens , best_throughput: $best_throughput , profile saved in: $PROFILE_PATH " >> " $RESULT "
244
276
0 commit comments