
Commit c3453df

Squashed feat/max-error-rate
Signed-off-by: Alon Kellner <akellner@redhat.com>
Co-authored-by: Mark Vakhansky <mvakhans@redhat.com>
1 parent f1f8ca8 commit c3453df

27 files changed: +867 -158 lines changed

.gitignore

Lines changed: 2 additions & 0 deletions
```diff
@@ -178,6 +178,8 @@ cython_debug/
 # Project specific files
 *.json
 *.yaml
+/bin
+uv.lock
 
 # But not scenarios
 !src/guidellm/benchmark/scenarios/*.json
```

README.md

Lines changed: 2 additions & 0 deletions
```diff
@@ -147,6 +147,8 @@ The `guidellm benchmark` command is used to run benchmarks against a generative
 
 - `--max-requests`: Sets the maximum number of requests for each benchmark run. If not provided, the benchmark will run until `--max-seconds` is reached or the dataset is exhausted.
 
+- `--max-error`: The maximum error threshold after which a benchmark will stop. Can be either a rate (0 < rate < 1) or a constant number. If a rate is given, `rate_type` is 'constant', and `max_seconds` is set, the threshold is calculated against the total expected request count (i.e., rate * duration). If a rate is given and the number of requests is not pre-determined, a context window over the most recent requests is checked; the window size is configurable via `GUIDELLM__ERROR_CHECK_WINDOW_SIZE`. If a number above 1 is given, the total number of errors is counted and checked against the threshold.
+
 - `--warmup-percent`: Specifies the percentage of the benchmark to treat as a warmup phase. Requests during this phase are excluded from the final results.
 
 - `--cooldown-percent`: Specifies the percentage of the benchmark to treat as a cooldown phase. Requests during this phase are excluded from the final results.
```
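To make the three interpretations concrete, here is a minimal sketch of the stopping decision (illustrative only: `should_stop` and its parameters are hypothetical names, not part of guidellm's API, and the default window size of 30 is an assumption):

```python
from collections import deque
from typing import Optional

ERROR_CHECK_WINDOW_SIZE = 30  # assumed default; set via GUIDELLM__ERROR_CHECK_WINDOW_SIZE


def should_stop(
    max_error: float,
    total_errors: int,
    expected_total: Optional[int],
    recent_errored: deque,  # True/False per finished request, most recent only
) -> bool:
    """Hypothetical helper showing the three --max-error interpretations."""
    if max_error >= 1:
        # Absolute threshold: stop once the total error count exceeds it.
        return total_errors > max_error
    if expected_total is not None:
        # Known total (constant rate with max_seconds: rate * duration):
        # allow at most max_error * expected_total errors overall.
        return total_errors > max_error * expected_total
    # Unknown total: check the error rate over a window of recent requests.
    if len(recent_errored) < ERROR_CHECK_WINDOW_SIZE:
        return False  # not enough finished requests to judge yet
    return sum(recent_errored) / len(recent_errored) > max_error
```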

pyproject.toml

Lines changed: 1 addition & 0 deletions
```diff
@@ -78,6 +78,7 @@ dev = [
     "pytest-cov~=5.0.0",
     "pytest-mock~=3.14.0",
     "pytest-rerunfailures~=14.0",
+    "pytest-timeout~=2.4.0",
     "respx~=0.22.0",
 
     # code quality
```
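For context on the new dev dependency: `pytest-timeout` lets the suite fail tests that hang instead of blocking CI. A generic usage sketch (not code from this commit):

```python
import time

import pytest


@pytest.mark.timeout(5)  # fail the test if it runs longer than 5 seconds
def test_finishes_quickly():
    time.sleep(0.1)  # well under the limit
```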

src/guidellm/__main__.py

Lines changed: 20 additions & 2 deletions
```diff
@@ -178,12 +178,28 @@ def benchmark():
         "If None, will run until max_seconds or the data is exhausted."
     ),
 )
+@click.option(
+    "--max-error",
+    type=float,
+    help=(
+        "The maximum error threshold after which a benchmark will stop. "
+        "Can be either a rate (0 < rate < 1) or a constant number. "
+        "If a rate is given, rate_type is 'constant', and max_seconds is set, "
+        "then the threshold is calculated against the total expected "
+        "request count, i.e. rate * duration. If a rate is given and the number "
+        "of requests is not pre-determined, a context window "
+        "over the most recent requests is checked. The context window size "
+        "is configurable via GUIDELLM__ERROR_CHECK_WINDOW_SIZE. "
+        "If a number above 1 is given, the total number of errors "
+        "is counted and checked against the threshold."
+    ),
+)
 @click.option(
     "--warmup-percent",
     type=float,
     default=GenerativeTextScenario.get_default("warmup_percent"),
     help=(
-        "The percent of the benchmark (based on max-seconds, max-requets, "
+        "The percent of the benchmark (based on max-seconds, max-requests, "
         "or lenth of dataset) to run as a warmup and not include in the final results. "
         "Defaults to None."
     ),
@@ -193,7 +209,7 @@ def benchmark():
     type=float,
     default=GenerativeTextScenario.get_default("cooldown_percent"),
     help=(
-        "The percent of the benchmark (based on max-seconds, max-requets, or lenth "
+        "The percent of the benchmark (based on max-seconds, max-requests, or length "
         "of dataset) to run as a cooldown and not include in the final results. "
         "Defaults to None."
     ),
@@ -259,6 +275,7 @@ def run(
     rate,
     max_seconds,
     max_requests,
+    max_error,
     warmup_percent,
     cooldown_percent,
     disable_progress,
@@ -286,6 +303,7 @@ def run(
         rate=rate,
         max_seconds=max_seconds,
         max_requests=max_requests,
+        max_error=max_error,
         warmup_percent=warmup_percent,
         cooldown_percent=cooldown_percent,
         output_sampling=output_sampling,
```
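With the option wired through `run`, an invocation along the lines of `guidellm benchmark --target http://localhost:8000 --max-seconds 60 --max-error 0.05` (the flags besides `--max-error` are plausible examples, not taken from this diff) would stop the run once the observed error rate crosses 5%.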

src/guidellm/backend/openai.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -102,7 +102,7 @@ def __init__(
             raise ValueError("Target URL must be provided for OpenAI HTTP backend.")
 
         if self._target.endswith("/v1") or self._target.endswith("/v1/"):
-            # backwards compatability, strip v1 off
+            # backwards compatibility, strip v1 off
            self._target = self._target[:-3]
 
         if self._target.endswith("/"):
```

src/guidellm/benchmark/aggregator.py

Lines changed: 39 additions & 8 deletions
```diff
@@ -1,19 +1,13 @@
 import time
 from abc import ABC, abstractmethod
 from pathlib import Path
-from typing import (
-    Any,
-    Generic,
-    Literal,
-    Optional,
-    TypeVar,
-    Union,
-)
+from typing import Any, Generic, Literal, Optional, TypeVar, Union, get_args
 
 from pydantic import Field
 
 from guidellm.backend import ResponseSummary
 from guidellm.benchmark.benchmark import (
+    REASON_STATUS_MAPPING,
     BenchmarkArgs,
     BenchmarkRunStats,
     BenchmarkT,
@@ -40,6 +34,7 @@
     SchedulerRequestResult,
     WorkerDescription,
 )
+from guidellm.scheduler.result import TerminationReason
 from guidellm.utils import check_load_processor
 
 __all__ = [
@@ -305,6 +300,24 @@ class BenchmarkAggregator(
             total=None,
         ),
     )
+    current_window: int = Field(
+        description=(
+            "The current accumulated window size for error checking. "
+            "This is a number between 0 and the value of "
+            "GUIDELLM__ERROR_CHECK_WINDOW_SIZE."
+        ),
+        default=0,
+    )
+    errors_in_window: int = Field(
+        description="The number of errored requests in the current window.",
+        default=0,
+    )
+    termination_reason: TerminationReason = Field(
+        description=(
+            f"The benchmark termination reason, one of: {get_args(TerminationReason)}"
+        ),
+        default="interrupted",
+    )
 
     def add_result(
         self,
@@ -600,6 +613,8 @@ def compile(self) -> GenerativeBenchmark:
         """
         successful, incomplete, errored = self._compile_results()
 
+        error_rate, window_error_rate = self._calculate_error_rate()
+
         return GenerativeBenchmark.from_stats(
             run_id=self.run_id,
             successful=successful,
@@ -625,12 +640,28 @@ def compile(self) -> GenerativeBenchmark:
                 request_start_time_targeted_delay_avg=self.requests_stats.request_start_time_targeted_delay.mean,
                 request_time_delay_avg=self.requests_stats.request_time_delay.mean,
                 request_time_avg=self.requests_stats.request_time.mean,
+                error_rate=error_rate,
+                window_error_rate=window_error_rate,
+                status=REASON_STATUS_MAPPING[self.termination_reason],
+                termination_reason=self.termination_reason,
             ),
             worker=self.worker_description,
             requests_loader=self.request_loader_description,
             extras=self.extras,
         )
 
+    def _calculate_error_rate(self) -> tuple[float, float]:
+        total_successful = self.requests_stats.totals.successful.total
+        total_errored = self.requests_stats.totals.errored.total
+        total_finished = total_errored + total_successful
+        error_rate = 0.0 if total_finished == 0 else (total_errored / total_finished)
+        window_error_rate = (
+            0.0
+            if self.current_window == 0
+            else self.errors_in_window / self.current_window
+        )
+        return error_rate, window_error_rate
+
     def _compile_results(
         self,
     ) -> tuple[
```
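The zero-denominator guards in `_calculate_error_rate` are the important detail; here is a self-contained restatement (a sketch that takes the counters as plain arguments instead of reading the aggregator's fields):

```python
def calculate_error_rate(
    total_successful: int,
    total_errored: int,
    current_window: int,
    errors_in_window: int,
) -> tuple[float, float]:
    # Overall rate: errored requests over all finished requests, reporting
    # 0.0 for an empty benchmark instead of dividing by zero.
    total_finished = total_errored + total_successful
    error_rate = 0.0 if total_finished == 0 else total_errored / total_finished
    # Window rate: errors within the current error-check window, with the
    # same guard for an empty window.
    window_error_rate = (
        0.0 if current_window == 0 else errors_in_window / current_window
    )
    return error_rate, window_error_rate


assert calculate_error_rate(95, 5, 10, 2) == (0.05, 0.2)
```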

src/guidellm/benchmark/benchmark.py

Lines changed: 41 additions & 1 deletion
```diff
@@ -1,6 +1,6 @@
 import random
 import uuid
-from typing import Any, Literal, Optional, TypeVar, Union
+from typing import Any, Literal, Optional, TypeVar, Union, get_args
 
 from pydantic import Field, computed_field
 
@@ -32,6 +32,7 @@
     ThroughputStrategy,
     WorkerDescription,
 )
+from guidellm.scheduler.result import TerminationReason
 
 __all__ = [
     "Benchmark",
@@ -46,6 +47,14 @@
     "StatusBreakdown",
 ]
 
+BenchmarkStatus = Literal["success", "error", "interrupted"]
+REASON_STATUS_MAPPING: dict[TerminationReason, BenchmarkStatus] = {
+    "interrupted": "interrupted",
+    "max_error_reached": "error",
+    "max_seconds_reached": "success",
+    "max_requests_reached": "success",
+}
+
 
 class BenchmarkArgs(StandardBaseModel):
     """
@@ -90,6 +99,9 @@ class BenchmarkArgs(StandardBaseModel):
     max_duration: Optional[float] = Field(
         description="The maximum duration in seconds to run this benchmark, if any."
     )
+    max_error: Optional[float] = Field(
+        description="Maximum error rate or constant count after which a benchmark will stop."
+    )
     warmup_number: Optional[int] = Field(
         description=(
             "The number of requests to run for the warmup phase of this benchmark, "
@@ -213,6 +225,34 @@ class BenchmarkRunStats(StandardBaseModel):
             "it was completed."
         )
     )
+    error_rate: float = Field(
+        description=(
+            "The number of total errored requests divided by the number "
+            "of total successful and errored requests at the end of the benchmark."
+        )
+    )
+    window_error_rate: float = Field(
+        description=(
+            "The number of errored requests within the error checking window "
+            "divided by the window size at the end of the benchmark. "
+            "If the window_error_rate is above the max_error, "
+            "the termination_reason should be 'max_error_reached'. "
+            "You may configure the error checking window size by setting "
+            "the environment variable GUIDELLM__ERROR_CHECK_WINDOW_SIZE."
+        )
+    )
+    status: BenchmarkStatus = Field(
+        description=(
+            "The status of the benchmark output, "
+            f"one of the following options: {get_args(BenchmarkStatus)}."
+        )
+    )
+    termination_reason: TerminationReason = Field(
+        description=(
+            "The reason for the benchmark termination, "
+            f"one of the following options: {get_args(TerminationReason)}."
+        )
+    )
 
 
 class BenchmarkMetrics(StandardBaseModel):
```
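The mapping collapses the scheduler's four termination reasons into three output statuses: hitting a time or request limit still counts as a successful run, while tripping the error threshold marks the run as an error. A standalone illustration (the two `Literal` types are restated here so the snippet runs on its own):

```python
from typing import Literal

TerminationReason = Literal[
    "interrupted", "max_error_reached", "max_seconds_reached", "max_requests_reached"
]
BenchmarkStatus = Literal["success", "error", "interrupted"]

REASON_STATUS_MAPPING: dict[TerminationReason, BenchmarkStatus] = {
    "interrupted": "interrupted",
    "max_error_reached": "error",
    "max_seconds_reached": "success",
    "max_requests_reached": "success",
}

# Normal limits end a run successfully; the error threshold does not.
assert REASON_STATUS_MAPPING["max_seconds_reached"] == "success"
assert REASON_STATUS_MAPPING["max_error_reached"] == "error"
```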

src/guidellm/benchmark/benchmarker.py

Lines changed: 17 additions & 0 deletions
```diff
@@ -74,6 +74,12 @@ class BenchmarkerStrategyLimits(StandardBaseModel):
         description="Maximum duration (in seconds) to process requests per strategy.",
         ge=0,
     )
+    max_error_per_strategy: Optional[float] = Field(
+        description="Maximum error after which a "
+        "benchmark will stop, "
+        "either a rate or a fixed number.",
+        ge=0,
+    )
     warmup_percent_per_strategy: Optional[float] = Field(
         description="Percentage of requests to use for warmup.",
         ge=0,
@@ -99,6 +105,10 @@ def max_number(self) -> Optional[int]:
     def max_duration(self) -> Optional[float]:
         return self.max_duration_per_strategy
 
+    @property
+    def max_error(self) -> Optional[float]:
+        return self.max_error_per_strategy
+
     @property
     def warmup_number(self) -> Optional[int]:
         if self.warmup_percent_per_strategy is None or self.max_number is None:
@@ -148,6 +158,7 @@ async def run(
         profile: Profile,
         max_number_per_strategy: Optional[int],
         max_duration_per_strategy: Optional[float],
+        max_error_per_strategy: Optional[float],
         warmup_percent_per_strategy: Optional[float],
         cooldown_percent_per_strategy: Optional[float],
     ) -> AsyncGenerator[
@@ -162,6 +173,7 @@ async def run(
             requests_loader_size=requests_loader_size,
             max_number_per_strategy=max_number_per_strategy,
             max_duration_per_strategy=max_duration_per_strategy,
+            max_error_per_strategy=max_error_per_strategy,
             warmup_percent_per_strategy=warmup_percent_per_strategy,
             cooldown_percent_per_strategy=cooldown_percent_per_strategy,
         )
@@ -196,6 +208,7 @@ async def run(
             scheduling_strategy=scheduling_strategy,
             max_number=max_number_per_strategy,
             max_duration=max_duration_per_strategy,
+            max_error=max_error_per_strategy,
         ):
             if result.type_ == "run_start":
                 yield BenchmarkerResult(
@@ -210,6 +223,9 @@ async def run(
                     current_result=None,
                 )
             elif result.type_ == "run_complete":
+                aggregator.termination_reason = result.run_info.termination_reason
+                aggregator.current_window = result.run_info.current_window
+                aggregator.errors_in_window = result.run_info.errors_in_window
                 yield BenchmarkerResult(
                     type_="scheduler_complete",
                     start_time=start_time,
@@ -321,6 +337,7 @@ def create_benchmark_aggregator(
         strategy=strategy,
         max_number=limits.max_number,
        max_duration=limits.max_duration,
+        max_error=limits.max_error,
         warmup_number=limits.warmup_number,
         warmup_duration=limits.warmup_duration,
         cooldown_number=limits.cooldown_number,
```
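The scheduler-side bookkeeping that fills `result.run_info.current_window` and `errors_in_window` is not part of this file's hunks; the sketch below shows how such a sliding-window check could behave (all names are illustrative assumptions, not guidellm's actual worker code):

```python
from collections import deque


class ErrorWindow:
    """Illustrative sliding-window error tracker, not the real scheduler code."""

    def __init__(self, window_size: int, max_error: float) -> None:
        self.results: deque = deque(maxlen=window_size)  # True = request errored
        self.max_error = max_error

    def record(self, errored: bool) -> bool:
        """Record one finished request; return True if the run should stop."""
        self.results.append(errored)
        current_window = len(self.results)
        errors_in_window = sum(self.results)
        if current_window < self.results.maxlen:
            return False  # window not full yet; keep accumulating
        return errors_in_window / current_window > self.max_error


window = ErrorWindow(window_size=4, max_error=0.5)
stops = [window.record(e) for e in (True, True, False, True)]
assert stops == [False, False, False, True]  # 3/4 > 0.5 once the window fills
```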

src/guidellm/benchmark/entrypoints.py

Lines changed: 2 additions & 0 deletions
```diff
@@ -53,6 +53,7 @@ async def benchmark_generative_text(
     rate: Optional[Union[float, list[float]]],
     max_seconds: Optional[float],
     max_requests: Optional[int],
+    max_error: Optional[float],
     warmup_percent: Optional[float],
     cooldown_percent: Optional[float],
     output_path: Optional[Union[str, Path]],
@@ -119,6 +120,7 @@ async def benchmark_generative_text(
         profile=profile,
         max_number_per_strategy=max_requests,
         max_duration_per_strategy=max_seconds,
+        max_error_per_strategy=max_error,
         warmup_percent_per_strategy=warmup_percent,
         cooldown_percent_per_strategy=cooldown_percent,
     ):
```

src/guidellm/benchmark/output.py

Lines changed: 1 addition & 0 deletions
```diff
@@ -422,6 +422,7 @@ def benchmarks_args_str(self) -> str:
         {
             "max_number": args.max_number,
             "max_duration": args.max_duration,
+            "max_error": args.max_error,
             "warmup_number": args.warmup_number,
             "warmup_duration": args.warmup_duration,
             "cooldown_number": args.cooldown_number,
```
