@@ -1,5 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 """Benchmark the latency of processing a single batch of requests."""
+
 import argparse
 import dataclasses
 import json
@@ -26,6 +27,7 @@ def main(args: argparse.Namespace):
     @contextmanager
     def rpd_profiler_context():
         from rpdTracerControl import rpdTracerControl as rpd
+
         llm.start_profile()
         yield
         llm.stop_profile()
@@ -39,15 +41,16 @@ def torch_profiler_context(profile_result_dir: Optional[str] = None):
                 torch.profiler.ProfilerActivity.CUDA,
             ],
             on_trace_ready=torch.profiler.tensorboard_trace_handler(
-                str(profile_result_dir)))
+                str(profile_result_dir)
+            ),
+        )
         p.start()
         try:
             with torch.no_grad():
                 yield p
         finally:
             p.stop()
-            print(p.key_averages().table(sort_by="self_cuda_time_total",
-                                         row_limit=-1))
+            print(p.key_averages().table(sort_by="self_cuda_time_total", row_limit=-1))

     def get_profiling_context(profile_result_dir: Optional[str] = None):
         if args.profile_torch:
@@ -58,15 +61,16 @@ def get_profiling_context(profile_result_dir: Optional[str] = None):
             return nullcontext()

     if args.profile_torch or args.profile_rpd:
-        profile_result_dir = Path(args.profile_result_dir
-                                  or "./vllm_benchmark_latency_result")
+        profile_result_dir = Path(
+            args.profile_result_dir or "./vllm_benchmark_latency_result"
+        )
         profile_result_dir.mkdir(parents=True, exist_ok=True)
         name = os.path.basename(os.path.normpath(args.model))
         model_trace_name = (
             f"{name}_in_{args.input_len}_out_{args.output_len}_"
-            f"batch_{args.batch_size}_tp_{args.tensor_parallel_size}")
-        print(
-            f"Profiling (results will be saved to '{profile_result_dir}')...")
+            f"batch_{args.batch_size}_tp_{args.tensor_parallel_size}"
+        )
+        print(f"Profiling (results will be saved to '{profile_result_dir}')...")
         if args.profile_rpd:
             profile_result_dir /= f"{model_trace_name}.rpd"
             os.environ["VLLM_RPD_PROFILER_DIR"] = str(profile_result_dir)
@@ -85,26 +89,25 @@ def get_profiling_context(profile_result_dir: Optional[str] = None):
         max_tokens=args.output_len,
     )
     print(sampling_params)
-    dummy_prompt_token_ids = np.random.randint(10000,
-                                               size=(args.batch_size,
-                                                     args.input_len))
-    dummy_prompts: list[PromptType] = [{
-        "prompt_token_ids": batch
-    } for batch in dummy_prompt_token_ids.tolist()]
+    dummy_prompt_token_ids = np.random.randint(
+        10000, size=(args.batch_size, args.input_len)
+    )
+    dummy_prompts: list[PromptType] = [
+        {"prompt_token_ids": batch} for batch in dummy_prompt_token_ids.tolist()
+    ]

     def llm_generate():
         if not args.use_beam_search:
-            llm.generate(dummy_prompts,
-                         sampling_params=sampling_params,
-                         use_tqdm=False)
+            llm.generate(dummy_prompts, sampling_params=sampling_params, use_tqdm=False)
         else:
             llm.beam_search(
                 dummy_prompts,
                 BeamSearchParams(
                     beam_width=args.n,
                     max_tokens=args.output_len,
                     ignore_eos=True,
-                ))
+                ),
+            )

     def run_to_completion(profile_dir: Optional[str] = None):
         if profile_dir:
@@ -132,9 +135,9 @@ def run_to_completion(profile_dir: Optional[str] = None):
     latencies = np.array(latencies)
     percentages = [10, 25, 50, 75, 90, 99]
     percentiles = np.percentile(latencies, percentages)
-    print(f'Avg latency: {np.mean(latencies)} seconds')
+    print(f"Avg latency: {np.mean(latencies)} seconds")
     for percentage, percentile in zip(percentages, percentiles):
-        print(f'{percentage}% percentile latency: {percentile} seconds')
+        print(f"{percentage}% percentile latency: {percentile} seconds")

     # Output JSON results if specified
     if args.output_json:
@@ -147,45 +150,52 @@ def run_to_completion(profile_dir: Optional[str] = None):
             json.dump(results, f, indent=4)


-if __name__ == '__main__':
+if __name__ == "__main__":
     parser = FlexibleArgumentParser(
-        description='Benchmark the latency of processing a single batch of '
-        'requests till completion.')
-    parser.add_argument('--input-len', type=int, default=32)
-    parser.add_argument('--output-len', type=int, default=128)
-    parser.add_argument('--batch-size', type=int, default=8)
-    parser.add_argument('--n',
-                        type=int,
-                        default=1,
-                        help='Number of generated sequences per prompt.')
-    parser.add_argument('--use-beam-search', action='store_true')
-    parser.add_argument('--num-iters-warmup',
-                        type=int,
-                        default=10,
-                        help='Number of iterations to run for warmup.')
-    parser.add_argument('--num-iters',
-                        type=int,
-                        default=30,
-                        help='Number of iterations to run.')
+        description="Benchmark the latency of processing a single batch of "
+        "requests till completion."
+    )
+    parser.add_argument("--input-len", type=int, default=32)
+    parser.add_argument("--output-len", type=int, default=128)
+    parser.add_argument("--batch-size", type=int, default=8)
+    parser.add_argument(
+        "--n", type=int, default=1, help="Number of generated sequences per prompt."
+    )
+    parser.add_argument("--use-beam-search", action="store_true")
+    parser.add_argument(
+        "--num-iters-warmup",
+        type=int,
+        default=10,
+        help="Number of iterations to run for warmup.",
+    )
     parser.add_argument(
-        '--profile-torch',
-        action='store_true',
-        help='profile the generation process of a single batch')
+        "--num-iters", type=int, default=30, help="Number of iterations to run."
+    )
     parser.add_argument(
-        '--profile-rpd',
-        action='store_true',
-        help='profile the generation process of a single batch')
+        "--profile-torch",
+        action="store_true",
+        help="profile the generation process of a single batch",
+    )
     parser.add_argument(
-        '--profile-result-dir',
+        "--profile-rpd",
+        action="store_true",
+        help="profile the generation process of a single batch",
+    )
+    parser.add_argument(
+        "--profile-result-dir",
         type=str,
-        default=os.getenv('VLLM_RPD_PROFILER_DIR', default=None),
-        help=('path to save the profiler output. Can be visualized '
-              'with ui.perfetto.dev or Tensorboard.'))
+        default=os.getenv("VLLM_RPD_PROFILER_DIR", default=None),
+        help=(
+            "path to save the profiler output. Can be visualized "
+            "with ui.perfetto.dev or Tensorboard."
+        ),
+    )
     parser.add_argument(
-        '--output-json',
+        "--output-json",
         type=str,
        default=None,
-        help='Path to save the latency results in JSON format.')
+        help="Path to save the latency results in JSON format.",
+    )

     parser = EngineArgs.add_cli_args(parser)
     args = parser.parse_args()
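
For reference, a sketch of how the reformatted script might be invoked (the `benchmarks/benchmark_latency.py` path and the model name are assumptions; `--model` and `--tensor-parallel-size` are not defined above but come from `EngineArgs.add_cli_args(parser)`, which the script wires in at the end — the code references `args.model` and `args.tensor_parallel_size` accordingly):

    python benchmarks/benchmark_latency.py \
        --model facebook/opt-125m \
        --input-len 32 \
        --output-len 128 \
        --batch-size 8 \
        --num-iters-warmup 10 \
        --num-iters 30 \
        --output-json latency.json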