Feat/max error rate - continued #238

Open · wants to merge 3 commits into base: main
2 changes: 2 additions & 0 deletions .gitignore
@@ -179,6 +179,8 @@ cython_debug/
# Project specific files
*.json
*.yaml
/bin
uv.lock

# But not scenarios
!src/guidellm/benchmark/scenarios/*.json
2 changes: 2 additions & 0 deletions README.md
@@ -147,6 +147,8 @@ The `guidellm benchmark` command is used to run benchmarks against a generative

- `--max-requests`: Sets the maximum number of requests for each benchmark run. If not provided, the benchmark will run until `--max-seconds` is reached or the dataset is exhausted.

- `--max-error`: The maximum error threshold after which a benchmark will stop. It can be either a rate (0 < value < 1) or an absolute count. If a rate is given, `rate_type` is 'constant', and `max_seconds` is set, the threshold is evaluated against the total expected request count (rate * duration). If a rate is given and the number of requests is not pre-determined, a sliding window of the most recent requests is checked instead; the window size is configurable via `GUIDELLM__ERROR_CHECK_WINDOW_SIZE`. If a value above 1 is given, the total number of errored requests is counted and compared against the threshold. A minimal sketch of this rule is shown after this README excerpt.

- `--warmup-percent`: Specifies the percentage of the benchmark to treat as a warmup phase. Requests during this phase are excluded from the final results.

- `--cooldown-percent`: Specifies the percentage of the benchmark to treat as a cooldown phase. Requests during this phase are excluded from the final results.
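To make the behavior concrete, here is a minimal sketch of the stopping rule described above. It is illustrative only, not the guidellm implementation: the function and parameter names (`should_stop_on_errors`, `finished_window`, `expected_total`) are assumptions, and the real scheduler tracks this state internally.

```python
from collections import deque
from typing import Optional


def should_stop_on_errors(
    max_error: float,
    total_errors: int,
    # Most recent finished outcomes (True = errored), created with
    # maxlen=GUIDELLM__ERROR_CHECK_WINDOW_SIZE.
    finished_window: deque,
    # Known when max_requests is set, or approximated as rate * max_seconds
    # for a constant rate_type.
    expected_total: Optional[int],
) -> bool:
    """Return True once the configured error threshold is exceeded."""
    if max_error > 1:
        # Absolute threshold: compare against the running error count.
        return total_errors > max_error
    if expected_total:
        # Rate against the total number of expected requests.
        return total_errors / expected_total > max_error
    # Otherwise judge a sliding window over the last N finished requests.
    if finished_window.maxlen is None or len(finished_window) < finished_window.maxlen:
        return False  # not enough finished requests to judge yet
    return sum(finished_window) / len(finished_window) > max_error
```

For example, `--max-error 0.05` with a constant rate and `--max-seconds 60` would stop the benchmark once more than 5% of the roughly `rate * 60` expected requests have errored.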
1 change: 1 addition & 0 deletions pyproject.toml
@@ -79,6 +79,7 @@ dev = [
"pytest-cov~=5.0.0",
"pytest-mock~=3.14.0",
"pytest-rerunfailures~=14.0",
"pytest-timeout~=2.4.0",
"respx~=0.22.0",

# code quality
22 changes: 20 additions & 2 deletions src/guidellm/__main__.py
@@ -178,12 +178,28 @@ def benchmark():
"If None, will run until max_seconds or the data is exhausted."
),
)
@click.option(
"--max-error",
type=float,
help=(
"The maximum error after which a benchmark will stop. "
"Can either be a rate i.e 0 < rate < 1 or constant number. "
"If rate is given and rate_type is 'constant' and 'max_seconds' exists "
"then the rate will be calculated as part of the total expected "
"requests count i.e rate * duration. If rate is given and number"
"of requests is not pre-determined than a context window "
"of the last requests will be looked at. Context window size"
"is configurable under GUIDELLM__ERROR_CHECK_WINDOW_SIZE."
"If a number above 1 is given than we just count the total"
"number of error and check if it's above the threshold."
),
)
@click.option(
"--warmup-percent",
type=float,
default=GenerativeTextScenario.get_default("warmup_percent"),
help=(
"The percent of the benchmark (based on max-seconds, max-requets, "
"The percent of the benchmark (based on max-seconds, max-requests, "
"or lenth of dataset) to run as a warmup and not include in the final results. "
"Defaults to None."
),
@@ -193,7 +209,7 @@ def benchmark():
type=float,
default=GenerativeTextScenario.get_default("cooldown_percent"),
help=(
"The percent of the benchmark (based on max-seconds, max-requets, or lenth "
"The percent of the benchmark (based on max-seconds, max-requests, or length "
"of dataset) to run as a cooldown and not include in the final results. "
"Defaults to None."
),
@@ -259,6 +275,7 @@ def run(
rate,
max_seconds,
max_requests,
max_error,
warmup_percent,
cooldown_percent,
disable_progress,
@@ -286,6 +303,7 @@
rate=rate,
max_seconds=max_seconds,
max_requests=max_requests,
max_error=max_error,
warmup_percent=warmup_percent,
cooldown_percent=cooldown_percent,
output_sampling=output_sampling,
2 changes: 1 addition & 1 deletion src/guidellm/backend/openai.py
@@ -103,7 +103,7 @@ def __init__(
raise ValueError("Target URL must be provided for OpenAI HTTP backend.")

if self._target.endswith("/v1") or self._target.endswith("/v1/"):
# backwards compatability, strip v1 off
# backwards compatibility, strip v1 off
self._target = self._target[:-3]

if self._target.endswith("/"):
47 changes: 39 additions & 8 deletions src/guidellm/benchmark/aggregator.py
@@ -1,19 +1,13 @@
import time
from abc import ABC, abstractmethod
from pathlib import Path
from typing import (
Any,
Generic,
Literal,
Optional,
TypeVar,
Union,
)
from typing import Any, Generic, Literal, Optional, TypeVar, Union, get_args

from pydantic import Field

from guidellm.backend import ResponseSummary
from guidellm.benchmark.benchmark import (
REASON_STATUS_MAPPING,
BenchmarkArgs,
BenchmarkRunStats,
BenchmarkT,
@@ -40,6 +34,7 @@
SchedulerRequestResult,
WorkerDescription,
)
from guidellm.scheduler.result import TerminationReason
from guidellm.utils import check_load_processor

__all__ = [
@@ -305,6 +300,24 @@ class BenchmarkAggregator(
total=None,
),
)
current_window: int = Field(
description=(
"The current accumulated window size for error checking. "
"This is a number between 0 and the value of "
"GUIDELLM__ERROR_CHECK_WINDOW_SIZE"
),
default=0,
)
errors_in_window: int = Field(
description=("The amount of errored requests in the current window."),
default=0,
)
termination_reason: TerminationReason = Field(
description=(
f"The benchmark termination reason, one of: {get_args(TerminationReason)}"
),
default="interrupted",
)

def add_result(
self,
@@ -600,6 +613,8 @@ def compile(self) -> GenerativeBenchmark:
"""
successful, incomplete, errored = self._compile_results()

error_rate, window_error_rate = self._calculate_error_rate()

return GenerativeBenchmark.from_stats(
run_id=self.run_id,
successful=successful,
@@ -625,12 +640,28 @@
request_start_time_targeted_delay_avg=self.requests_stats.request_start_time_targeted_delay.mean,
request_time_delay_avg=self.requests_stats.request_time_delay.mean,
request_time_avg=self.requests_stats.request_time.mean,
error_rate=error_rate,
window_error_rate=window_error_rate,
status=REASON_STATUS_MAPPING[self.termination_reason],
termination_reason=self.termination_reason,
),
worker=self.worker_description,
requests_loader=self.request_loader_description,
extras=self.extras,
)

def _calculate_error_rate(self) -> tuple[float, float]:
total_successful = self.requests_stats.totals.successful.total
total_errored = self.requests_stats.totals.errored.total
total_finished = total_errored + total_successful
error_rate = 0.0 if total_finished == 0 else (total_errored / total_finished)
window_error_rate = (
0.0
if self.current_window == 0
else self.errors_in_window / self.current_window
)
return error_rate, window_error_rate

def _compile_results(
self,
) -> tuple[
42 changes: 41 additions & 1 deletion src/guidellm/benchmark/benchmark.py
@@ -1,6 +1,6 @@
import random
import uuid
from typing import Any, Literal, Optional, TypeVar, Union
from typing import Any, Literal, Optional, TypeVar, Union, get_args

from pydantic import Field, computed_field

@@ -32,6 +32,7 @@
ThroughputStrategy,
WorkerDescription,
)
from guidellm.scheduler.result import TerminationReason

__all__ = [
"Benchmark",
@@ -46,6 +47,14 @@
"StatusBreakdown",
]

BenchmarkStatus = Literal["success", "error", "interrupted"]
REASON_STATUS_MAPPING: dict[TerminationReason, BenchmarkStatus] = {
"interrupted": "interrupted",
"max_error_reached": "error",
"max_seconds_reached": "success",
"max_requests_reached": "success",
}


class BenchmarkArgs(StandardBaseModel):
"""
@@ -90,6 +99,9 @@ class BenchmarkArgs(StandardBaseModel):
max_duration: Optional[float] = Field(
description="The maximum duration in seconds to run this benchmark, if any."
)
max_error: Optional[float] = Field(
description="Maximum error rate or const after which a benchmark will stop."
)
warmup_number: Optional[int] = Field(
description=(
"The number of requests to run for the warmup phase of this benchmark, "
@@ -213,6 +225,34 @@ class BenchmarkRunStats(StandardBaseModel):
"it was completed."
)
)
error_rate: float = Field(
description=(
"The number of total errored requests divided by the number "
"of total successful and errored requests at the end of benchmark. "
)
)
window_error_rate: float = Field(
description=(
"The number of errored requests within the error checking window"
"divided by the window size at the end of benchmark. "
"If the window_error_rate is above the max_error "
"the termination_reason should be 'max_error_reached'. "
"You may configure the error checking window size by setting "
"the environment variable GUIDELLM__ERROR_CHECK_WINDOW_SIZE."
)
)
status: BenchmarkStatus = Field(
description=(
f"The status of the benchmark output, "
f"one of the following options: {get_args(BenchmarkStatus)}."
)
)
termination_reason: TerminationReason = Field(
description=(
"The reason for the benchmark termination, "
f"one of the following options: {get_args(TerminationReason)}."
)
)


class BenchmarkMetrics(StandardBaseModel):
17 changes: 17 additions & 0 deletions src/guidellm/benchmark/benchmarker.py
@@ -74,6 +74,12 @@ class BenchmarkerStrategyLimits(StandardBaseModel):
description="Maximum duration (in seconds) to process requests per strategy.",
ge=0,
)
max_error_per_strategy: Optional[float] = Field(
description="Maximum error after which a "
"benchmark will stop,"
" either rate or fixed number",
ge=0,
)
warmup_percent_per_strategy: Optional[float] = Field(
description="Percentage of requests to use for warmup.",
ge=0,
@@ -99,6 +105,10 @@ def max_number(self) -> Optional[int]:
def max_duration(self) -> Optional[float]:
return self.max_duration_per_strategy

@property
def max_error(self) -> Optional[float]:
return self.max_error_per_strategy

@property
def warmup_number(self) -> Optional[int]:
if self.warmup_percent_per_strategy is None or self.max_number is None:
@@ -148,6 +158,7 @@ async def run(
profile: Profile,
max_number_per_strategy: Optional[int],
max_duration_per_strategy: Optional[float],
max_error_per_strategy: Optional[float],
warmup_percent_per_strategy: Optional[float],
cooldown_percent_per_strategy: Optional[float],
) -> AsyncGenerator[
@@ -162,6 +173,7 @@
requests_loader_size=requests_loader_size,
max_number_per_strategy=max_number_per_strategy,
max_duration_per_strategy=max_duration_per_strategy,
max_error_per_strategy=max_error_per_strategy,
warmup_percent_per_strategy=warmup_percent_per_strategy,
cooldown_percent_per_strategy=cooldown_percent_per_strategy,
)
@@ -196,6 +208,7 @@ async def run(
scheduling_strategy=scheduling_strategy,
max_number=max_number_per_strategy,
max_duration=max_duration_per_strategy,
max_error=max_error_per_strategy,
):
if result.type_ == "run_start":
yield BenchmarkerResult(
@@ -210,6 +223,9 @@
current_result=None,
)
elif result.type_ == "run_complete":
aggregator.termination_reason = result.run_info.termination_reason
aggregator.current_window = result.run_info.current_window
aggregator.errors_in_window = result.run_info.errors_in_window
yield BenchmarkerResult(
type_="scheduler_complete",
start_time=start_time,
@@ -321,6 +337,7 @@ def create_benchmark_aggregator(
strategy=strategy,
max_number=limits.max_number,
max_duration=limits.max_duration,
max_error=limits.max_error,
warmup_number=limits.warmup_number,
warmup_duration=limits.warmup_duration,
cooldown_number=limits.cooldown_number,
2 changes: 2 additions & 0 deletions src/guidellm/benchmark/entrypoints.py
@@ -53,6 +53,7 @@ async def benchmark_generative_text(
rate: Optional[Union[float, list[float]]],
max_seconds: Optional[float],
max_requests: Optional[int],
max_error: Optional[float],
warmup_percent: Optional[float],
cooldown_percent: Optional[float],
output_path: Optional[Union[str, Path]],
@@ -119,6 +120,7 @@
profile=profile,
max_number_per_strategy=max_requests,
max_duration_per_strategy=max_seconds,
max_error_per_strategy=max_error,
warmup_percent_per_strategy=warmup_percent,
cooldown_percent_per_strategy=cooldown_percent,
):
1 change: 1 addition & 0 deletions src/guidellm/benchmark/output.py
@@ -452,6 +452,7 @@ def benchmarks_args_str(self) -> str:
{
"max_number": args.max_number,
"max_duration": args.max_duration,
"max_error": args.max_error,
"warmup_number": args.warmup_number,
"warmup_duration": args.warmup_duration,
"cooldown_number": args.cooldown_number,
1 change: 1 addition & 0 deletions src/guidellm/benchmark/scenario.py
@@ -98,6 +98,7 @@ class Config:
] = None
max_seconds: Optional[PositiveFloat] = None
max_requests: Optional[PositiveInt] = None
max_error: Optional[PositiveFloat] = None
warmup_percent: Annotated[Optional[float], Field(gt=0, le=1)] = None
cooldown_percent: Annotated[Optional[float], Field(gt=0, le=1)] = None
output_sampling: Optional[NonNegativeInt] = None
2 changes: 2 additions & 0 deletions src/guidellm/config.py
@@ -121,6 +121,8 @@ class Settings(BaseSettings):
default_async_loop_sleep: float = 10e-5
logging: LoggingSettings = LoggingSettings()
default_sweep_number: int = 10
shutdown_poll_interval_seconds: float = 1
error_check_window_size: int = 30

# HTTP settings
request_follow_redirects: bool = True
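As a usage note, here is a hedged sketch of overriding the new `error_check_window_size` default from the environment. It assumes the `Settings` class above follows pydantic-settings conventions with a `GUIDELLM__` prefix, as implied by the `GUIDELLM__ERROR_CHECK_WINDOW_SIZE` variable referenced in the README and CLI help; the instantiation shown is illustrative rather than the library's documented entry point.

```python
import os

# Assumes Settings reads environment variables using the GUIDELLM__ prefix.
os.environ["GUIDELLM__ERROR_CHECK_WINDOW_SIZE"] = "50"

from guidellm.config import Settings  # module path taken from this diff

settings = Settings()
print(settings.error_check_window_size)  # expected: 50 (default is 30)
```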
14 changes: 14 additions & 0 deletions src/guidellm/scheduler/result.py
@@ -16,6 +16,11 @@
]


TerminationReason = Literal[
"interrupted", "max_error_reached", "max_seconds_reached", "max_requests_reached"
]


class SchedulerRunInfo(StandardBaseModel):
"""
Information about the current run of the scheduler.
@@ -46,12 +51,21 @@ class SchedulerRunInfo(StandardBaseModel):
end_number: float
processes: int
strategy: SchedulingStrategy
max_error: Optional[float] = None
current_window: int = 0
errors_in_window: int = 0

created_requests: int = 0
queued_requests: int = 0
scheduled_requests: int = 0
processing_requests: int = 0
completed_requests: int = 0
errored_requests: int = 0

# The default is "interrupted" to be fail-safe: if the
# `termination_reason` logic is not reached for any reason,
# we assume the run was interrupted.
termination_reason: TerminationReason = "interrupted"


class SchedulerRequestInfo(StandardBaseModel):