Commit c13eddf

pre-commit
1 parent 34483a3 commit c13eddf

File tree

2 files changed, +229 -190 lines


benchmarks/profiling/benchmark_latency.py

Lines changed: 61 additions & 51 deletions
@@ -1,5 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 """Benchmark the latency of processing a single batch of requests."""
+
 import argparse
 import dataclasses
 import json
@@ -26,6 +27,7 @@ def main(args: argparse.Namespace):
     @contextmanager
     def rpd_profiler_context():
         from rpdTracerControl import rpdTracerControl as rpd
+
         llm.start_profile()
         yield
         llm.stop_profile()
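
The rpd context manager above follows the usual start/yield/stop shape. A minimal generic sketch of that pattern (the profiler object and its start_profile/stop_profile methods are stand-ins, not the rpdTracerControl API; the try/finally is extra hardening not present in the hunk):

    from contextlib import contextmanager

    @contextmanager
    def profiler_context(profiler):
        # Start profiling, hand control to the caller's block, and stop
        # even if that block raises.
        profiler.start_profile()
        try:
            yield
        finally:
            profiler.stop_profile()
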
@@ -39,15 +41,16 @@ def torch_profiler_context(profile_result_dir: Optional[str] = None):
                 torch.profiler.ProfilerActivity.CUDA,
             ],
             on_trace_ready=torch.profiler.tensorboard_trace_handler(
-                str(profile_result_dir)))
+                str(profile_result_dir)
+            ),
+        )
         p.start()
         try:
             with torch.no_grad():
                 yield p
         finally:
             p.stop()
-            print(p.key_averages().table(sort_by="self_cuda_time_total",
-                                         row_limit=-1))
+            print(p.key_averages().table(sort_by="self_cuda_time_total", row_limit=-1))
 
     def get_profiling_context(profile_result_dir: Optional[str] = None):
         if args.profile_torch:
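
For anyone reproducing the torch profiler setup outside the benchmark, a self-contained sketch with the same activities, trace handler, and summary table (the matmul workload and the ./trace_out directory are made up for illustration; drop ProfilerActivity.CUDA on CPU-only machines):

    import torch
    from torch.profiler import ProfilerActivity, profile, tensorboard_trace_handler

    x = torch.randn(1024, 1024, device="cuda")
    with profile(
        activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
        on_trace_ready=tensorboard_trace_handler("./trace_out"),
    ) as p:
        # Inference-only workload, mirroring the benchmark's torch.no_grad().
        with torch.no_grad():
            torch.matmul(x, x)
    print(p.key_averages().table(sort_by="self_cuda_time_total", row_limit=-1))
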
@@ -58,15 +61,16 @@ def get_profiling_context(profile_result_dir: Optional[str] = None):
         return nullcontext()
 
     if args.profile_torch or args.profile_rpd:
-        profile_result_dir = Path(args.profile_result_dir
-                                  or "./vllm_benchmark_latency_result")
+        profile_result_dir = Path(
+            args.profile_result_dir or "./vllm_benchmark_latency_result"
+        )
         profile_result_dir.mkdir(parents=True, exist_ok=True)
         name = os.path.basename(os.path.normpath(args.model))
         model_trace_name = (
             f"{name}_in_{args.input_len}_out_{args.output_len}_"
-            f"batch_{args.batch_size}_tp_{args.tensor_parallel_size}")
-        print(
-            f"Profiling (results will be saved to '{profile_result_dir}')...")
+            f"batch_{args.batch_size}_tp_{args.tensor_parallel_size}"
+        )
+        print(f"Profiling (results will be saved to '{profile_result_dir}')...")
         if args.profile_rpd:
             profile_result_dir /= f"{model_trace_name}.rpd"
             os.environ["VLLM_RPD_PROFILER_DIR"] = str(profile_result_dir)
@@ -85,26 +89,25 @@ def get_profiling_context(profile_result_dir: Optional[str] = None):
         max_tokens=args.output_len,
     )
     print(sampling_params)
-    dummy_prompt_token_ids = np.random.randint(10000,
-                                               size=(args.batch_size,
-                                                     args.input_len))
-    dummy_prompts: list[PromptType] = [{
-        "prompt_token_ids": batch
-    } for batch in dummy_prompt_token_ids.tolist()]
+    dummy_prompt_token_ids = np.random.randint(
+        10000, size=(args.batch_size, args.input_len)
+    )
+    dummy_prompts: list[PromptType] = [
+        {"prompt_token_ids": batch} for batch in dummy_prompt_token_ids.tolist()
+    ]
 
     def llm_generate():
         if not args.use_beam_search:
-            llm.generate(dummy_prompts,
-                         sampling_params=sampling_params,
-                         use_tqdm=False)
+            llm.generate(dummy_prompts, sampling_params=sampling_params, use_tqdm=False)
         else:
             llm.beam_search(
                 dummy_prompts,
                 BeamSearchParams(
                     beam_width=args.n,
                     max_tokens=args.output_len,
                     ignore_eos=True,
-                ))
+                ),
+            )
 
     def run_to_completion(profile_dir: Optional[str] = None):
         if profile_dir:
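
The reshaped dummy-prompt construction is behavior-preserving; a quick shape check under the parser defaults below (batch_size=8, input_len=32):

    import numpy as np

    batch_size, input_len = 8, 32
    # One row of input_len random token ids in [0, 10000) per request.
    token_ids = np.random.randint(10000, size=(batch_size, input_len))
    prompts = [{"prompt_token_ids": row} for row in token_ids.tolist()]
    assert len(prompts) == batch_size
    assert len(prompts[0]["prompt_token_ids"]) == input_len
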
@@ -132,9 +135,9 @@ def run_to_completion(profile_dir: Optional[str] = None):
     latencies = np.array(latencies)
     percentages = [10, 25, 50, 75, 90, 99]
     percentiles = np.percentile(latencies, percentages)
-    print(f'Avg latency: {np.mean(latencies)} seconds')
+    print(f"Avg latency: {np.mean(latencies)} seconds")
     for percentage, percentile in zip(percentages, percentiles):
-        print(f'{percentage}% percentile latency: {percentile} seconds')
+        print(f"{percentage}% percentile latency: {percentile} seconds")
 
     # Output JSON results if specified
     if args.output_json:
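
Reporting is unchanged here apart from quoting; np.percentile evaluates all requested percentiles in one vectorized call, so the loop stays aligned with the percentages list. A runnable sketch with made-up latencies:

    import numpy as np

    latencies = np.array([1.02, 0.98, 1.10, 1.05, 0.99, 1.21])  # illustrative
    percentages = [10, 25, 50, 75, 90, 99]
    percentiles = np.percentile(latencies, percentages)
    print(f"Avg latency: {np.mean(latencies)} seconds")
    for percentage, percentile in zip(percentages, percentiles):
        print(f"{percentage}% percentile latency: {percentile} seconds")
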
@@ -147,45 +150,52 @@ def run_to_completion(profile_dir: Optional[str] = None):
             json.dump(results, f, indent=4)
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     parser = FlexibleArgumentParser(
-        description='Benchmark the latency of processing a single batch of '
-        'requests till completion.')
-    parser.add_argument('--input-len', type=int, default=32)
-    parser.add_argument('--output-len', type=int, default=128)
-    parser.add_argument('--batch-size', type=int, default=8)
-    parser.add_argument('--n',
-                        type=int,
-                        default=1,
-                        help='Number of generated sequences per prompt.')
-    parser.add_argument('--use-beam-search', action='store_true')
-    parser.add_argument('--num-iters-warmup',
-                        type=int,
-                        default=10,
-                        help='Number of iterations to run for warmup.')
-    parser.add_argument('--num-iters',
-                        type=int,
-                        default=30,
-                        help='Number of iterations to run.')
+        description="Benchmark the latency of processing a single batch of "
+        "requests till completion."
+    )
+    parser.add_argument("--input-len", type=int, default=32)
+    parser.add_argument("--output-len", type=int, default=128)
+    parser.add_argument("--batch-size", type=int, default=8)
+    parser.add_argument(
+        "--n", type=int, default=1, help="Number of generated sequences per prompt."
+    )
+    parser.add_argument("--use-beam-search", action="store_true")
+    parser.add_argument(
+        "--num-iters-warmup",
+        type=int,
+        default=10,
+        help="Number of iterations to run for warmup.",
+    )
     parser.add_argument(
-        '--profile-torch',
-        action='store_true',
-        help='profile the generation process of a single batch')
+        "--num-iters", type=int, default=30, help="Number of iterations to run."
+    )
     parser.add_argument(
-        '--profile-rpd',
-        action='store_true',
-        help='profile the generation process of a single batch')
+        "--profile-torch",
+        action="store_true",
+        help="profile the generation process of a single batch",
+    )
     parser.add_argument(
-        '--profile-result-dir',
+        "--profile-rpd",
+        action="store_true",
+        help="profile the generation process of a single batch",
+    )
+    parser.add_argument(
+        "--profile-result-dir",
         type=str,
-        default=os.getenv('VLLM_RPD_PROFILER_DIR', default=None),
-        help=('path to save the profiler output. Can be visualized '
-              'with ui.perfetto.dev or Tensorboard.'))
+        default=os.getenv("VLLM_RPD_PROFILER_DIR", default=None),
+        help=(
+            "path to save the profiler output. Can be visualized "
+            "with ui.perfetto.dev or Tensorboard."
+        ),
+    )
     parser.add_argument(
-        '--output-json',
+        "--output-json",
         type=str,
        default=None,
-        help='Path to save the latency results in JSON format.')
+        help="Path to save the latency results in JSON format.",
+    )
 
     parser = EngineArgs.add_cli_args(parser)
     args = parser.parse_args()
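
--profile-result-dir keeps its environment-backed default through the reformat. A minimal sketch of that pattern with plain argparse (FlexibleArgumentParser and the EngineArgs flags are vLLM-specific and omitted here):

    import argparse
    import os

    parser = argparse.ArgumentParser()
    # The CLI flag wins if given; else VLLM_RPD_PROFILER_DIR; else None.
    parser.add_argument(
        "--profile-result-dir",
        type=str,
        default=os.getenv("VLLM_RPD_PROFILER_DIR", default=None),
    )
    print(parser.parse_args([]).profile_result_dir)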
