Feat/max error rate - continued #238

Open · wants to merge 3 commits into base: main
2 changes: 2 additions & 0 deletions .gitignore
@@ -179,6 +179,8 @@ cython_debug/
# Project specific files
*.json
*.yaml
/bin
uv.lock

# But not scenarios
!src/guidellm/benchmark/scenarios/*.json
2 changes: 2 additions & 0 deletions README.md
@@ -147,6 +147,8 @@ The `guidellm benchmark` command is used to run benchmarks against a generative

- `--max-requests`: Sets the maximum number of requests for each benchmark run. If not provided, the benchmark will run until `--max-seconds` is reached or the dataset is exhausted.

- `--max-error`: The maximum error threshold after which a benchmark will stop. It can be either a rate (0 < value < 1) or an absolute count. If a rate is given, `rate_type` is 'constant', and `max_seconds` is set, the threshold is evaluated against the total expected request count (rate * duration). If a rate is given and the number of requests is not pre-determined, a sliding window of the most recent requests is checked instead; the window size is configurable via `GUIDELLM__ERROR_CHECK_WINDOW_SIZE`. If a value above 1 is given, the total number of errored requests is counted and compared against the threshold. A minimal sketch of this rule is shown after this README excerpt.

- `--warmup-percent`: Specifies the percentage of the benchmark to treat as a warmup phase. Requests during this phase are excluded from the final results.

- `--cooldown-percent`: Specifies the percentage of the benchmark to treat as a cooldown phase. Requests during this phase are excluded from the final results.
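To make the behavior concrete, here is a minimal sketch of the stopping rule described above. It is illustrative only, not the guidellm implementation: the function and parameter names (`should_stop_on_errors`, `finished_window`, `expected_total`) are assumptions, and the real scheduler tracks this state internally.

```python
from collections import deque
from typing import Optional


def should_stop_on_errors(
    max_error: float,
    total_errors: int,
    # Most recent finished outcomes (True = errored), created with
    # maxlen=GUIDELLM__ERROR_CHECK_WINDOW_SIZE.
    finished_window: deque,
    # Known when max_requests is set, or approximated as rate * max_seconds
    # for a constant rate_type.
    expected_total: Optional[int],
) -> bool:
    """Return True once the configured error threshold is exceeded."""
    if max_error > 1:
        # Absolute threshold: compare against the running error count.
        return total_errors > max_error
    if expected_total:
        # Rate against the total number of expected requests.
        return total_errors / expected_total > max_error
    # Otherwise judge a sliding window over the last N finished requests.
    if finished_window.maxlen is None or len(finished_window) < finished_window.maxlen:
        return False  # not enough finished requests to judge yet
    return sum(finished_window) / len(finished_window) > max_error
```

For example, `--max-error 0.05` with a constant rate and `--max-seconds 60` would stop the benchmark once more than 5% of the roughly `rate * 60` expected requests have errored.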
1 change: 1 addition & 0 deletions pyproject.toml
@@ -79,6 +79,7 @@ dev = [
"pytest-cov~=5.0.0",
"pytest-mock~=3.14.0",
"pytest-rerunfailures~=14.0",
"pytest-timeout~=2.4.0",
"respx~=0.22.0",

# code quality
22 changes: 20 additions & 2 deletions src/guidellm/__main__.py
@@ -178,12 +178,28 @@ def benchmark():
"If None, will run until max_seconds or the data is exhausted."
),
)
@click.option(
"--max-error",
type=float,
help=(
"The maximum error after which a benchmark will stop. "
"Can either be a rate i.e 0 < rate < 1 or constant number. "
"If rate is given and rate_type is 'constant' and 'max_seconds' exists "
"then the rate will be calculated as part of the total expected "
"requests count i.e rate * duration. If rate is given and number"
"of requests is not pre-determined than a context window "
"of the last requests will be looked at. Context window size"
"is configurable under GUIDELLM__ERROR_CHECK_WINDOW_SIZE."
"If a number above 1 is given than we just count the total"
"number of error and check if it's above the threshold."
),
)
@click.option(
"--warmup-percent",
type=float,
default=GenerativeTextScenario.get_default("warmup_percent"),
help=(
"The percent of the benchmark (based on max-seconds, max-requets, "
"The percent of the benchmark (based on max-seconds, max-requests, "
"or lenth of dataset) to run as a warmup and not include in the final results. "
"Defaults to None."
),
@@ -193,7 +209,7 @@ def benchmark():
type=float,
default=GenerativeTextScenario.get_default("cooldown_percent"),
help=(
"The percent of the benchmark (based on max-seconds, max-requets, or lenth "
"The percent of the benchmark (based on max-seconds, max-requests, or length "
"of dataset) to run as a cooldown and not include in the final results. "
"Defaults to None."
),
@@ -259,6 +275,7 @@ def run(
rate,
max_seconds,
max_requests,
max_error,
warmup_percent,
cooldown_percent,
disable_progress,
@@ -286,6 +303,7 @@
rate=rate,
max_seconds=max_seconds,
max_requests=max_requests,
max_error=max_error,
warmup_percent=warmup_percent,
cooldown_percent=cooldown_percent,
output_sampling=output_sampling,
2 changes: 1 addition & 1 deletion src/guidellm/backend/openai.py
@@ -103,7 +103,7 @@ def __init__(
raise ValueError("Target URL must be provided for OpenAI HTTP backend.")

if self._target.endswith("/v1") or self._target.endswith("/v1/"):
# backwards compatability, strip v1 off
# backwards compatibility, strip v1 off
self._target = self._target[:-3]

if self._target.endswith("/"):
47 changes: 39 additions & 8 deletions src/guidellm/benchmark/aggregator.py
@@ -1,19 +1,13 @@
import time
from abc import ABC, abstractmethod
from pathlib import Path
from typing import (
Any,
Generic,
Literal,
Optional,
TypeVar,
Union,
)
from typing import Any, Generic, Literal, Optional, TypeVar, Union, get_args

from pydantic import Field

from guidellm.backend import ResponseSummary
from guidellm.benchmark.benchmark import (
REASON_STATUS_MAPPING,
BenchmarkArgs,
BenchmarkRunStats,
BenchmarkT,
@@ -40,6 +34,7 @@
SchedulerRequestResult,
WorkerDescription,
)
from guidellm.scheduler.result import TerminationReason
from guidellm.utils import check_load_processor

__all__ = [
@@ -305,6 +300,24 @@ class BenchmarkAggregator(
total=None,
),
)
current_window: int = Field(
description=(
"The current accumulated window size for error checking. "
"This is a number between 0 and the value of "
"GUIDELLM__ERROR_CHECK_WINDOW_SIZE"
),
default=0,
)
errors_in_window: int = Field(
description=("The amount of errored requests in the current window."),
default=0,
)
termination_reason: TerminationReason = Field(
description=(
f"The benchmark termination reason, one of: {get_args(TerminationReason)}"
),
default="interrupted",
)

def add_result(
self,
@@ -600,6 +613,8 @@ def compile(self) -> GenerativeBenchmark:
"""
successful, incomplete, errored = self._compile_results()

error_rate, window_error_rate = self._calculate_error_rate()

return GenerativeBenchmark.from_stats(
run_id=self.run_id,
successful=successful,
@@ -625,12 +640,28 @@
request_start_time_targeted_delay_avg=self.requests_stats.request_start_time_targeted_delay.mean,
request_time_delay_avg=self.requests_stats.request_time_delay.mean,
request_time_avg=self.requests_stats.request_time.mean,
error_rate=error_rate,
window_error_rate=window_error_rate,
status=REASON_STATUS_MAPPING[self.termination_reason],
termination_reason=self.termination_reason,
),
worker=self.worker_description,
requests_loader=self.request_loader_description,
extras=self.extras,
)

def _calculate_error_rate(self) -> tuple[float, float]:
total_successful = self.requests_stats.totals.successful.total
total_errored = self.requests_stats.totals.errored.total
total_finished = total_errored + total_successful
error_rate = 0.0 if total_finished == 0 else (total_errored / total_finished)
window_error_rate = (
0.0
if self.current_window == 0
else self.errors_in_window / self.current_window
)
return error_rate, window_error_rate

def _compile_results(
self,
) -> tuple[
42 changes: 41 additions & 1 deletion src/guidellm/benchmark/benchmark.py
@@ -1,6 +1,6 @@
import random
import uuid
from typing import Any, Literal, Optional, TypeVar, Union
from typing import Any, Literal, Optional, TypeVar, Union, get_args

from pydantic import Field, computed_field

@@ -32,6 +32,7 @@
ThroughputStrategy,
WorkerDescription,
)
from guidellm.scheduler.result import TerminationReason

__all__ = [
"Benchmark",
@@ -46,6 +47,14 @@
"StatusBreakdown",
]

BenchmarkStatus = Literal["success", "error", "interrupted"]
REASON_STATUS_MAPPING: dict[TerminationReason, BenchmarkStatus] = {
"interrupted": "interrupted",
"max_error_reached": "error",
"max_seconds_reached": "success",
"max_requests_reached": "success",
}


class BenchmarkArgs(StandardBaseModel):
"""
@@ -90,6 +99,9 @@ class BenchmarkArgs(StandardBaseModel):
max_duration: Optional[float] = Field(
description="The maximum duration in seconds to run this benchmark, if any."
)
max_error: Optional[float] = Field(
description="Maximum error rate or const after which a benchmark will stop."
)
warmup_number: Optional[int] = Field(
description=(
"The number of requests to run for the warmup phase of this benchmark, "
@@ -213,6 +225,34 @@ class BenchmarkRunStats(StandardBaseModel):
"it was completed."
)
)
error_rate: float = Field(
description=(
"The number of total errored requests divided by the number "
"of total successful and errored requests at the end of benchmark. "
)
)
window_error_rate: float = Field(
description=(
"The number of errored requests within the error checking window"
"divided by the window size at the end of benchmark. "
"If the window_error_rate is above the max_error "
"the termination_reason should be 'max_error_reached'. "
"You may configure the error checking window size by setting "
"the environment variable GUIDELLM__ERROR_CHECK_WINDOW_SIZE."
)
)
status: BenchmarkStatus = Field(
description=(
f"The status of the benchmark output, "
f"one of the following options: {get_args(BenchmarkStatus)}."
)
)
termination_reason: TerminationReason = Field(
description=(
"The reason for the benchmark termination, "
f"one of the following options: {get_args(TerminationReason)}."
)
)


class BenchmarkMetrics(StandardBaseModel):
17 changes: 17 additions & 0 deletions src/guidellm/benchmark/benchmarker.py
@@ -74,6 +74,12 @@ class BenchmarkerStrategyLimits(StandardBaseModel):
description="Maximum duration (in seconds) to process requests per strategy.",
ge=0,
)
max_error_per_strategy: Optional[float] = Field(
description="Maximum error after which a "
"benchmark will stop,"
" either rate or fixed number",
ge=0,
)
warmup_percent_per_strategy: Optional[float] = Field(
description="Percentage of requests to use for warmup.",
ge=0,
@@ -99,6 +105,10 @@ def max_number(self) -> Optional[int]:
def max_duration(self) -> Optional[float]:
return self.max_duration_per_strategy

@property
def max_error(self) -> Optional[float]:
return self.max_error_per_strategy

@property
def warmup_number(self) -> Optional[int]:
if self.warmup_percent_per_strategy is None or self.max_number is None:
@@ -148,6 +158,7 @@ async def run(
profile: Profile,
max_number_per_strategy: Optional[int],
max_duration_per_strategy: Optional[float],
max_error_per_strategy: Optional[float],
warmup_percent_per_strategy: Optional[float],
cooldown_percent_per_strategy: Optional[float],
) -> AsyncGenerator[
@@ -162,6 +173,7 @@
requests_loader_size=requests_loader_size,
max_number_per_strategy=max_number_per_strategy,
max_duration_per_strategy=max_duration_per_strategy,
max_error_per_strategy=max_error_per_strategy,
warmup_percent_per_strategy=warmup_percent_per_strategy,
cooldown_percent_per_strategy=cooldown_percent_per_strategy,
)
@@ -196,6 +208,7 @@ async def run(
scheduling_strategy=scheduling_strategy,
max_number=max_number_per_strategy,
max_duration=max_duration_per_strategy,
max_error=max_error_per_strategy,
):
if result.type_ == "run_start":
yield BenchmarkerResult(
@@ -210,6 +223,9 @@
current_result=None,
)
elif result.type_ == "run_complete":
aggregator.termination_reason = result.run_info.termination_reason
aggregator.current_window = result.run_info.current_window
aggregator.errors_in_window = result.run_info.errors_in_window
yield BenchmarkerResult(
type_="scheduler_complete",
start_time=start_time,
@@ -321,6 +337,7 @@ def create_benchmark_aggregator(
strategy=strategy,
max_number=limits.max_number,
max_duration=limits.max_duration,
max_error=limits.max_error,
warmup_number=limits.warmup_number,
warmup_duration=limits.warmup_duration,
cooldown_number=limits.cooldown_number,
2 changes: 2 additions & 0 deletions src/guidellm/benchmark/entrypoints.py
@@ -53,6 +53,7 @@ async def benchmark_generative_text(
rate: Optional[Union[float, list[float]]],
max_seconds: Optional[float],
max_requests: Optional[int],
max_error: Optional[float],
warmup_percent: Optional[float],
cooldown_percent: Optional[float],
output_path: Optional[Union[str, Path]],
@@ -119,6 +120,7 @@
profile=profile,
max_number_per_strategy=max_requests,
max_duration_per_strategy=max_seconds,
max_error_per_strategy=max_error,
warmup_percent_per_strategy=warmup_percent,
cooldown_percent_per_strategy=cooldown_percent,
):
1 change: 1 addition & 0 deletions src/guidellm/benchmark/output.py
@@ -452,6 +452,7 @@ def benchmarks_args_str(self) -> str:
{
"max_number": args.max_number,
"max_duration": args.max_duration,
"max_error": args.max_error,
"warmup_number": args.warmup_number,
"warmup_duration": args.warmup_duration,
"cooldown_number": args.cooldown_number,
1 change: 1 addition & 0 deletions src/guidellm/benchmark/scenario.py
@@ -98,6 +98,7 @@ class Config:
] = None
max_seconds: Optional[PositiveFloat] = None
max_requests: Optional[PositiveInt] = None
max_error: Optional[PositiveFloat] = None
warmup_percent: Annotated[Optional[float], Field(gt=0, le=1)] = None
cooldown_percent: Annotated[Optional[float], Field(gt=0, le=1)] = None
output_sampling: Optional[NonNegativeInt] = None
2 changes: 2 additions & 0 deletions src/guidellm/config.py
@@ -121,6 +121,8 @@ class Settings(BaseSettings):
default_async_loop_sleep: float = 10e-5
logging: LoggingSettings = LoggingSettings()
default_sweep_number: int = 10
shutdown_poll_interval_seconds: float = 1
error_check_window_size: int = 30

# HTTP settings
request_follow_redirects: bool = True
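As a usage note, here is a hedged sketch of overriding the new `error_check_window_size` default from the environment. It assumes the `Settings` class above follows pydantic-settings conventions with a `GUIDELLM__` prefix, as implied by the `GUIDELLM__ERROR_CHECK_WINDOW_SIZE` variable referenced in the README and CLI help; the instantiation shown is illustrative rather than the library's documented entry point.

```python
import os

# Assumes Settings reads environment variables using the GUIDELLM__ prefix.
os.environ["GUIDELLM__ERROR_CHECK_WINDOW_SIZE"] = "50"

from guidellm.config import Settings  # module path taken from this diff

settings = Settings()
print(settings.error_check_window_size)  # expected: 50 (default is 30)
```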
14 changes: 14 additions & 0 deletions src/guidellm/scheduler/result.py
@@ -16,6 +16,11 @@
]


TerminationReason = Literal[
"interrupted", "max_error_reached", "max_seconds_reached", "max_requests_reached"
]


class SchedulerRunInfo(StandardBaseModel):
"""
Information about the current run of the scheduler.
@@ -46,12 +51,21 @@ class SchedulerRunInfo(StandardBaseModel):
end_number: float
processes: int
strategy: SchedulingStrategy
max_error: Optional[float] = None
current_window: int = 0
errors_in_window: int = 0

created_requests: int = 0
queued_requests: int = 0
scheduled_requests: int = 0
processing_requests: int = 0
completed_requests: int = 0
errored_requests: int = 0

# The default is "interrupted" to be fail-safe: if the
# `termination_reason` logic is not reached for any reason,
# we assume the run was interrupted.
termination_reason: TerminationReason = "interrupted"


class SchedulerRequestInfo(StandardBaseModel):