
Commit c635c5f

dtransposed and ywang96 authored
[Misc][Benchmarking] Add variable request-rate ("ramp-up") to the benchmarking client. (#19423)
Signed-off-by: dtransposed <damian@damian-ml-machine.europe-west3-b.c.jetbrains-grazie.internal>
Co-authored-by: dtransposed <damian@damian-ml-machine.europe-west3-b.c.jetbrains-grazie.internal>
Co-authored-by: Roger Wang <hey@rogerw.me>
1 parent a045b7e commit c635c5f

File tree

3 files changed (+330, -34 lines)


benchmarks/README.md

Lines changed: 15 additions & 0 deletions
@@ -269,6 +269,21 @@ python3 vllm/benchmarks/benchmark_serving.py \
   --num-prompts 10
 ```

+### Running With Ramp-Up Request Rate
+
+The benchmark tool also supports ramping up the request rate over the
+duration of the benchmark run. This can be useful for stress testing the
+server or finding the maximum throughput that it can handle, given some latency budget.
+
+Two ramp-up strategies are supported:
+- `linear`: Increases the request rate linearly from a start value to an end value.
+- `exponential`: Increases the request rate exponentially.
+
+The following arguments can be used to control the ramp-up:
+- `--ramp-up-strategy`: The ramp-up strategy to use (`linear` or `exponential`).
+- `--ramp-up-start-rps`: The request rate at the beginning of the benchmark.
+- `--ramp-up-end-rps`: The request rate at the end of the benchmark.
+
 ---
 ## Example - Offline Throughput Benchmark
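
For intuition, here is a minimal sketch (not part of the commit) of how the two documented strategies interpolate between the start and end rates; it mirrors the `_get_current_request_rate` helper added to benchmark_serving.py below, and the start/end values are assumed purely for illustration.

```python
# Illustrative only: map benchmark progress (0.0 at the first request, 1.0 at
# the last) to a target request rate, for assumed example start/end values.
start_rps, end_rps = 1, 10


def linear_rps(progress: float) -> float:
    # Straight-line interpolation from start_rps to end_rps.
    return start_rps + (end_rps - start_rps) * progress


def exponential_rps(progress: float) -> float:
    # Geometric interpolation: start_rps grows by a constant factor per unit of progress.
    return start_rps * (end_rps / start_rps) ** progress


for p in (0.0, 0.25, 0.5, 0.75, 1.0):
    print(f"progress={p:.2f}  linear={linear_rps(p):.2f} RPS  exponential={exponential_rps(p):.2f} RPS")
```
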
benchmarks/benchmark_serving.py

Lines changed: 161 additions & 17 deletions
@@ -33,7 +33,7 @@
 from collections.abc import AsyncGenerator, Iterable
 from dataclasses import dataclass
 from datetime import datetime
-from typing import Any, Optional
+from typing import Any, Literal, Optional

 import numpy as np
 from tqdm.asyncio import tqdm
@@ -107,14 +107,42 @@ class BenchmarkMetrics:
     percentiles_e2el_ms: list[tuple[float, float]]


+def _get_current_request_rate(
+    ramp_up_strategy: Optional[Literal["linear", "exponential"]],
+    ramp_up_start_rps: Optional[int],
+    ramp_up_end_rps: Optional[int],
+    request_index: int,
+    total_requests: int,
+    request_rate: float,
+) -> float:
+    if (
+        ramp_up_strategy
+        and ramp_up_start_rps is not None
+        and ramp_up_end_rps is not None
+    ):
+        progress = request_index / max(total_requests - 1, 1)
+        if ramp_up_strategy == "linear":
+            increase = (ramp_up_end_rps - ramp_up_start_rps) * progress
+            return ramp_up_start_rps + increase
+        elif ramp_up_strategy == "exponential":
+            ratio = ramp_up_end_rps / ramp_up_start_rps
+            return ramp_up_start_rps * (ratio**progress)
+        else:
+            raise ValueError(f"Unknown ramp-up strategy: {ramp_up_strategy}")
+    return request_rate
+
+
 async def get_request(
     input_requests: list[SampleRequest],
     request_rate: float,
     burstiness: float = 1.0,
-) -> AsyncGenerator[SampleRequest, None]:
+    ramp_up_strategy: Optional[Literal["linear", "exponential"]] = None,
+    ramp_up_start_rps: Optional[int] = None,
+    ramp_up_end_rps: Optional[int] = None,
+) -> AsyncGenerator[tuple[SampleRequest, float], None]:
     """
     Asynchronously generates requests at a specified rate
-    with OPTIONAL burstiness.
+    with OPTIONAL burstiness and OPTIONAL ramp-up strategy.

     Args:
         input_requests:
@@ -129,22 +157,44 @@ async def get_request(
             A lower burstiness value (0 < burstiness < 1) results
             in more bursty requests, while a higher burstiness value
             (burstiness > 1) results in a more uniform arrival of requests.
+        ramp_up_strategy (optional):
+            The ramp-up strategy. Can be "linear" or "exponential".
+            If None, uses constant request rate (specified by request_rate).
+        ramp_up_start_rps (optional):
+            The starting request rate for ramp-up.
+        ramp_up_end_rps (optional):
+            The ending request rate for ramp-up.
     """
-    input_requests: Iterable[SampleRequest] = iter(input_requests)
-
-    # Calculate scale parameter theta to maintain the desired request_rate.
     assert burstiness > 0, (
         f"A positive burstiness factor is expected, but given {burstiness}."
     )
-    theta = 1.0 / (request_rate * burstiness)
+    # Convert to list to get length for ramp-up calculations
+    if isinstance(input_requests, Iterable) and not isinstance(input_requests, list):
+        input_requests = list(input_requests)
+
+    total_requests = len(input_requests)
+    request_index = 0

     for request in input_requests:
-        yield request
+        current_request_rate = _get_current_request_rate(
+            ramp_up_strategy,
+            ramp_up_start_rps,
+            ramp_up_end_rps,
+            request_index,
+            total_requests,
+            request_rate,
+        )
+
+        yield request, current_request_rate

-        if request_rate == float("inf"):
+        request_index += 1
+
+        if current_request_rate == float("inf"):
             # If the request rate is infinity, then we don't need to wait.
             continue

+        theta = 1.0 / (current_request_rate * burstiness)
+
         # Sample the request interval from the gamma distribution.
         # If burstiness is 1, it follows exponential distribution.
         interval = np.random.gamma(shape=burstiness, scale=theta)
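
A note on the interval sampling kept above: whatever the current per-request rate is, the gap to the next request is drawn from a gamma distribution whose mean is 1 / rate (shape * scale = 1 / current_request_rate), so burstiness reshapes the arrival pattern without changing the average rate. A minimal standalone sketch, with assumed example values:

```python
import numpy as np

# Illustrative only: with theta = 1 / (rate * burstiness), the sampled gaps
# average 1 / rate regardless of burstiness; burstiness == 1 gives the
# exponential inter-arrival times of a Poisson process.
rate, burstiness = 5.0, 1.0  # assumed example values
theta = 1.0 / (rate * burstiness)
intervals = np.random.gamma(shape=burstiness, scale=theta, size=100_000)
print(intervals.mean())  # ~0.2 s, i.e. roughly 5 requests per second on average
```
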
@@ -290,6 +340,9 @@ async def benchmark(
     max_concurrency: Optional[int],
     lora_modules: Optional[Iterable[str]],
     extra_body: Optional[dict],
+    ramp_up_strategy: Optional[Literal["linear", "exponential"]] = None,
+    ramp_up_start_rps: Optional[int] = None,
+    ramp_up_end_rps: Optional[int] = None,
 ):
     if backend in ASYNC_REQUEST_FUNCS:
         request_func = ASYNC_REQUEST_FUNCS[backend]
@@ -353,7 +406,15 @@ async def benchmark(

     distribution = "Poisson process" if burstiness == 1.0 else "Gamma distribution"

-    print(f"Traffic request rate: {request_rate}")
+    if ramp_up_strategy is not None:
+        print(
+            f"Traffic ramp-up strategy: {ramp_up_strategy}. Will increase "
+            f"RPS from {ramp_up_start_rps} to {ramp_up_end_rps} RPS over "
+            "the duration of the benchmark."
+        )
+    else:
+        print(f"Traffic request rate: {request_rate} RPS.")
+
     print(f"Burstiness factor: {burstiness} ({distribution})")
     print(f"Maximum request concurrency: {max_concurrency}")

@@ -373,7 +434,34 @@ async def limited_request_func(request_func_input, pbar):

     benchmark_start_time = time.perf_counter()
     tasks: list[asyncio.Task] = []
-    async for request in get_request(input_requests, request_rate, burstiness):
+
+    rps_change_events = []
+    last_int_rps = -1
+    if ramp_up_strategy is not None and ramp_up_start_rps is not None:
+        last_int_rps = ramp_up_start_rps
+        rps_change_events.append(
+            {
+                "rps": last_int_rps,
+                "timestamp": datetime.now().isoformat(),
+            }
+        )
+
+    async for request, current_request_rate in get_request(
+        input_requests,
+        request_rate,
+        burstiness,
+        ramp_up_strategy,
+        ramp_up_start_rps,
+        ramp_up_end_rps,
+    ):
+        if ramp_up_strategy is not None:
+            current_int_rps = int(current_request_rate)
+            if current_int_rps > last_int_rps:
+                timestamp = datetime.now().isoformat()
+                for rps_val in range(last_int_rps + 1, current_int_rps + 1):
+                    rps_change_events.append({"rps": rps_val, "timestamp": timestamp})
+                last_int_rps = current_int_rps
+
         prompt, prompt_len, output_len, mm_content = (
             request.prompt,
             request.prompt_len,
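
The milestone bookkeeping added above records one event per whole RPS value crossed while ramping. A minimal standalone sketch of the same logic, with an assumed example sequence of per-request target rates:

```python
from datetime import datetime

# Illustrative only: emit one {"rps", "timestamp"} event per integer RPS
# threshold crossed, as the loop in benchmark() does above.
rps_change_events = [{"rps": 1, "timestamp": datetime.now().isoformat()}]
last_int_rps = 1
for current_request_rate in (1.4, 2.2, 2.9, 4.5):  # assumed ramp-up rates
    current_int_rps = int(current_request_rate)
    if current_int_rps > last_int_rps:
        timestamp = datetime.now().isoformat()
        for rps_val in range(last_int_rps + 1, current_int_rps + 1):
            rps_change_events.append({"rps": rps_val, "timestamp": timestamp})
        last_int_rps = current_int_rps
print([event["rps"] for event in rps_change_events])  # [1, 2, 3, 4]
```
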
@@ -397,11 +485,8 @@ async def limited_request_func(request_func_input, pbar):
             ignore_eos=ignore_eos,
             extra_body=extra_body,
         )
-        tasks.append(
-            asyncio.create_task(
-                limited_request_func(request_func_input=request_func_input, pbar=pbar)
-            )
-        )
+        task = limited_request_func(request_func_input=request_func_input, pbar=pbar)
+        tasks.append(asyncio.create_task(task))
     outputs: list[RequestFuncOutput] = await asyncio.gather(*tasks)

     if profile:
@@ -477,6 +562,9 @@ async def limited_request_func(request_func_input, pbar):
         "errors": [output.error for output in outputs],
     }

+    if rps_change_events:
+        result["rps_change_events"] = rps_change_events
+
     def process_one_metric(
         # E.g., "ttft"
         metric_attribute_name: str,
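
With the change above, the saved results gain an `rps_change_events` entry when ramp-up is active. Roughly what that field looks like in the output JSON (the timestamps and values below are assumed examples, not real output):

```python
# Illustrative only: the shape of the extra field persisted alongside the
# other benchmark results when a ramp-up strategy is used.
example_rps_change_events = [
    {"rps": 1, "timestamp": "2025-06-11T09:00:00.000000"},
    {"rps": 2, "timestamp": "2025-06-11T09:00:41.512304"},
    {"rps": 3, "timestamp": "2025-06-11T09:01:12.847559"},
]
print(len(example_rps_change_events))  # 3
```
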
@@ -610,6 +698,26 @@ def main(args: argparse.Namespace):
     tokenizer_id = args.tokenizer if args.tokenizer is not None else args.model
     tokenizer_mode = args.tokenizer_mode

+    # Validate ramp-up arguments
+    if args.ramp_up_strategy is not None:
+        if args.request_rate != float("inf"):
+            raise ValueError(
+                "When using ramp-up, do not specify --request-rate. "
+                "The request rate will be controlled by ramp-up parameters. "
+                "Please remove the --request-rate argument."
+            )
+        if args.ramp_up_start_rps is None or args.ramp_up_end_rps is None:
+            raise ValueError(
+                "When using --ramp-up-strategy, both --ramp-up-start-rps and "
+                "--ramp-up-end-rps must be specified"
+            )
+        if args.ramp_up_start_rps < 0 or args.ramp_up_end_rps < 0:
+            raise ValueError("Ramp-up start and end RPS must be non-negative")
+        if args.ramp_up_start_rps > args.ramp_up_end_rps:
+            raise ValueError("Ramp-up start RPS must be less than end RPS")
+        if args.ramp_up_strategy == "exponential" and args.ramp_up_start_rps == 0:
+            raise ValueError("For exponential ramp-up, the start RPS cannot be 0.")
+
     if args.base_url is not None:
         api_url = f"{args.base_url}{args.endpoint}"
         base_url = f"{args.base_url}"
@@ -802,6 +910,9 @@ def main(args: argparse.Namespace):
             max_concurrency=args.max_concurrency,
             lora_modules=args.lora_modules,
             extra_body=sampling_params,
+            ramp_up_strategy=args.ramp_up_strategy,
+            ramp_up_start_rps=args.ramp_up_start_rps,
+            ramp_up_end_rps=args.ramp_up_end_rps,
         )
     )

@@ -834,6 +945,11 @@ def main(args: argparse.Namespace):
         result_json["burstiness"] = args.burstiness
         result_json["max_concurrency"] = args.max_concurrency

+        if args.ramp_up_strategy is not None:
+            result_json["ramp_up_strategy"] = args.ramp_up_strategy
+            result_json["ramp_up_start_rps"] = args.ramp_up_start_rps
+            result_json["ramp_up_end_rps"] = args.ramp_up_end_rps
+
         # Merge with benchmark result
         result_json = {**result_json, **benchmark_result}

@@ -859,7 +975,10 @@ def main(args: argparse.Namespace):
             if args.max_concurrency is not None
             else ""
         )
-        file_name = f"{backend}-{args.request_rate}qps{max_concurrency_str}-{base_model_id}-{current_dt}.json"  # noqa
+        if args.ramp_up_strategy is not None:
+            file_name = f"{backend}-ramp-up-{args.ramp_up_strategy}-{args.ramp_up_start_rps}qps-{args.ramp_up_end_rps}qps{max_concurrency_str}-{base_model_id}-{current_dt}.json"  # noqa
+        else:
+            file_name = f"{backend}-{args.request_rate}qps{max_concurrency_str}-{base_model_id}-{current_dt}.json"  # noqa
         if args.result_filename:
             file_name = args.result_filename
         if args.result_dir:
@@ -1225,6 +1344,31 @@ def create_argument_parser():
         "script chooses a LoRA module at random.",
     )

+    parser.add_argument(
+        "--ramp-up-strategy",
+        type=str,
+        default=None,
+        choices=["linear", "exponential"],
+        help="The ramp-up strategy. This would be used to "
+        "ramp up the request rate from initial RPS to final "
+        "RPS rate (specified by --ramp-up-start-rps and --ramp-up-end-rps). "
+        "over the duration of the benchmark.",
+    )
+    parser.add_argument(
+        "--ramp-up-start-rps",
+        type=int,
+        default=None,
+        help="The starting request rate for ramp-up (RPS). "
+        "Needs to be specified when --ramp-up-strategy is used.",
+    )
+    parser.add_argument(
+        "--ramp-up-end-rps",
+        type=int,
+        default=None,
+        help="The ending request rate for ramp-up (RPS). "
+        "Needs to be specified when --ramp-up-strategy is used.",
+    )
+
     return parser

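
Finally, the three flags registered above behave like ordinary argparse options. A minimal standalone sketch (a throwaway parser, not vLLM's `create_argument_parser()`), with assumed example values:

```python
import argparse

# Illustrative only: a stripped-down parser carrying just the three new flags.
parser = argparse.ArgumentParser()
parser.add_argument("--ramp-up-strategy", type=str, default=None, choices=["linear", "exponential"])
parser.add_argument("--ramp-up-start-rps", type=int, default=None)
parser.add_argument("--ramp-up-end-rps", type=int, default=None)

args = parser.parse_args(["--ramp-up-strategy", "linear", "--ramp-up-start-rps", "2", "--ramp-up-end-rps", "20"])
print(args.ramp_up_strategy, args.ramp_up_start_rps, args.ramp_up_end_rps)  # linear 2 20
```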