
Commit 031f36d

Output support expansion, code hygiene, and tests (#117)
Expand outputs with CSV support and a friendlier table output for copying. Additionally, add code docs and tests for the output files and fix the bugs that surfaced along the way.

Table format update example:
![Screenshot 2025-04-15 at 10 44 23 AM (1)](https://github.com/user-attachments/assets/34764172-f767-4f91-9165-643f1e3e30d4)

CSV example:
[benchmarks.csv](https://github.com/user-attachments/files/19784363/benchmarks.csv)
1 parent cb4bf2e commit 031f36d

17 files changed: +1531 −246 lines
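
As a quick illustration of the expanded output support, a minimal sketch is shown below. It assumes `GenerativeBenchmarksReport.save_file` picks the serializer from the file suffix; the `save_all_formats` helper and file names are illustrative only (the commit gitignores `benchmarks.json`, `benchmarks.yaml`, and `benchmarks.csv`).

```python
# Hypothetical helper, not part of the commit: writes one report in each of
# the formats the commit touches, assuming save_file infers the format from
# the file extension.
from pathlib import Path

from guidellm.benchmark import GenerativeBenchmarksReport


def save_all_formats(report: GenerativeBenchmarksReport) -> None:
    for name in ("benchmarks.json", "benchmarks.yaml", "benchmarks.csv"):
        saved = report.save_file(Path(name))
        print(f"wrote {saved}")
```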

.gitignore

Lines changed: 5 additions & 0 deletions
```diff
@@ -1,3 +1,8 @@
+# Output files
+benchmarks.json
+benchmarks.yaml
+benchmarks.csv
+
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
```

src/guidellm/__init__.py

Lines changed: 24 additions & 2 deletions
```diff
@@ -21,7 +21,29 @@
 hf_logging.set_verbosity_error()
 logging.getLogger("transformers").setLevel(logging.ERROR)
 
-from .config import settings
+from .config import (
+    settings,
+    DatasetSettings,
+    Environment,
+    LoggingSettings,
+    OpenAISettings,
+    print_config,
+    Settings,
+    reload_settings,
+)
 from .logger import configure_logger, logger
 
-__all__ = ["configure_logger", "logger", "settings", "generate_benchmark_report"]
+__all__ = [
+    # Config
+    "DatasetSettings",
+    "Environment",
+    "LoggingSettings",
+    "OpenAISettings",
+    "print_config",
+    "Settings",
+    "reload_settings",
+    "settings",
+    # Logger
+    "logger",
+    "configure_logger",
+]
```

src/guidellm/__main__.py

Lines changed: 11 additions & 0 deletions
```diff
@@ -210,6 +210,15 @@ def cli():
     callback=parse_json,
     help="A JSON string of extra data to save with the output benchmarks",
 )
+@click.option(
+    "--output-sampling",
+    type=int,
+    help=(
+        "The number of samples to save in the output file. "
+        "If None (default), will save all samples."
+    ),
+    default=None,
+)
 @click.option(
     "--random-seed",
     default=42,
@@ -237,6 +246,7 @@ def benchmark(
     disable_console_outputs,
     output_path,
     output_extras,
+    output_sampling,
     random_seed,
 ):
     asyncio.run(
@@ -261,6 +271,7 @@
             output_console=not disable_console_outputs,
             output_path=output_path,
             output_extras=output_extras,
+            output_sampling=output_sampling,
             random_seed=random_seed,
         )
     )
```
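
The `--output-sampling` flag follows a common click pattern: an optional integer that defaults to `None`, which the entrypoint treats as "keep everything". A standalone sketch of that pattern (the command and messages here are illustrative, not guidellm's CLI):

```python
# Illustrative only: a stand-alone click command using the same
# optional-integer pattern as --output-sampling above.
import click


@click.command()
@click.option(
    "--output-sampling",
    type=int,
    default=None,
    help="Number of requests to keep per status type; omit to keep all.",
)
def demo(output_sampling):
    if output_sampling is None:
        click.echo("Keeping all samples")
    else:
        click.echo(f"Sampling down to {output_sampling} requests per status")


if __name__ == "__main__":
    demo()
```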

src/guidellm/benchmark/__init__.py

Lines changed: 43 additions & 5 deletions
```diff
@@ -1,7 +1,19 @@
 from .aggregator import AggregatorT, BenchmarkAggregator, GenerativeBenchmarkAggregator
-from .benchmark import Benchmark, BenchmarkT, GenerativeBenchmark
+from .benchmark import (
+    Benchmark,
+    BenchmarkArgs,
+    BenchmarkMetrics,
+    BenchmarkRunStats,
+    BenchmarkT,
+    GenerativeBenchmark,
+    GenerativeMetrics,
+    GenerativeTextErrorStats,
+    GenerativeTextResponseStats,
+    StatusBreakdown,
+)
 from .benchmarker import Benchmarker, BenchmarkerResult, GenerativeBenchmarker
 from .entrypoints import benchmark_generative_text
+from .output import GenerativeBenchmarksConsole, GenerativeBenchmarksReport
 from .profile import (
     AsyncProfile,
     ConcurrentProfile,
@@ -12,17 +24,39 @@
     ThroughputProfile,
     create_profile,
 )
+from .progress import (
+    BenchmarkerProgressDisplay,
+    BenchmarkerTaskProgressState,
+    GenerativeTextBenchmarkerProgressDisplay,
+    GenerativeTextBenchmarkerTaskProgressState,
+)
 
 __all__ = [
+    # Aggregator
     "AggregatorT",
-    "BenchmarkT",
-    "Benchmark",
     "BenchmarkAggregator",
-    "GenerativeBenchmark",
     "GenerativeBenchmarkAggregator",
+    # Benchmark
+    "Benchmark",
+    "BenchmarkArgs",
+    "BenchmarkMetrics",
+    "BenchmarkRunStats",
+    "BenchmarkT",
+    "GenerativeBenchmark",
+    "GenerativeMetrics",
+    "GenerativeTextErrorStats",
+    "GenerativeTextResponseStats",
+    "StatusBreakdown",
+    # Benchmarker
     "Benchmarker",
     "BenchmarkerResult",
     "GenerativeBenchmarker",
+    # Entry points
+    "benchmark_generative_text",
+    # Output
+    "GenerativeBenchmarksConsole",
+    "GenerativeBenchmarksReport",
+    # Profile
     "AsyncProfile",
     "ConcurrentProfile",
     "Profile",
@@ -31,5 +65,9 @@
     "SynchronousProfile",
     "ThroughputProfile",
     "create_profile",
-    "benchmark_generative_text",
+    # Progress
+    "BenchmarkerProgressDisplay",
+    "BenchmarkerTaskProgressState",
+    "GenerativeTextBenchmarkerProgressDisplay",
+    "GenerativeTextBenchmarkerTaskProgressState",
 ]
```
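
With the expanded `__all__`, the output classes used later in this commit can be imported straight from the package root; a small sketch using only names the diff itself exports:

```python
# Import the newly re-exported output helpers from the package root.
from guidellm.benchmark import (
    GenerativeBenchmarksConsole,
    GenerativeBenchmarksReport,
)

report = GenerativeBenchmarksReport()
console = GenerativeBenchmarksConsole(enabled=True)
console.benchmarks = report.benchmarks
```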

src/guidellm/benchmark/benchmark.py

Lines changed: 39 additions & 29 deletions
```diff
@@ -457,7 +457,12 @@ def time_per_output_token_ms(self) -> Optional[float]: # type: ignore[override]
         This includes the time to generate the first token and all other tokens.
         None if the output_tokens is None or 0.
         """
-        if self.output_tokens is None or self.output_tokens == 0:
+        if (
+            self.output_tokens is None
+            or self.output_tokens == 0
+            or self.first_token_time is None
+            or self.last_token_time is None
+        ):
             return None
 
         return super().time_per_output_token_ms
@@ -614,41 +619,46 @@ def duration(self) -> float:
             ),
         )
 
-    def create_sampled(self, sample_size: int) -> "GenerativeBenchmark":
+    def set_sample_size(self, sample_size: Optional[int]) -> "GenerativeBenchmark":
         """
-        Create a new benchmark instance with a random sample of the completed and
-        errored requests based on the given sample sizes. If the sample sizes are
-        larger than the total number of requests, the sample sizes are capped at
-        the total number of requests.
+        Set the sample size for the benchmark. This will randomly sample the
+        requests for each status type to the given sample size or the maximum
+        number of requests for that status type, whichever is smaller.
+        This is applied to requests.successful, requests.errored, and
+        requests.incomplete.
+        If None, no sampling is applied and the state is kept.
 
         :param sample_size: The number of requests to sample for each status type.
-        :return: A new benchmark instance with the sampled requests.
-        :raises ValueError: If the sample sizes are negative.
+        :return: The benchmark with the sampled requests.
+        :raises ValueError: If the sample size is invalid.
         """
-        if sample_size < 0:
-            raise ValueError(f"Sample size must be non-negative, given {sample_size}")
 
-        sample_size = min(sample_size, len(self.requests.successful))
-        error_sample_size = min(sample_size, len(self.requests.errored))
-        incomplete_sample_size = min(sample_size, len(self.requests.incomplete))
+        if sample_size is not None:
+            if sample_size < 0 or not isinstance(sample_size, int):
+                raise ValueError(
+                    f"Sample size must be non-negative integer, given {sample_size}"
+                )
 
-        sampled_instance = self.model_copy()
-        sampled_instance.requests.successful = random.sample(
-            self.requests.successful, sample_size
-        )
-        sampled_instance.requests.errored = random.sample(
-            self.requests.errored, error_sample_size
-        )
-        sampled_instance.requests.incomplete = random.sample(
-            self.requests.incomplete, incomplete_sample_size
-        )
-        sampled_instance.request_samples = StatusBreakdown(
-            successful=len(sampled_instance.requests.successful),
-            incomplete=len(sampled_instance.requests.incomplete),
-            errored=len(sampled_instance.requests.errored),
-        )
+            sample_size = min(sample_size, len(self.requests.successful))
+            error_sample_size = min(sample_size, len(self.requests.errored))
+            incomplete_sample_size = min(sample_size, len(self.requests.incomplete))
+
+            self.requests.successful = random.sample(
+                self.requests.successful, sample_size
+            )
+            self.requests.errored = random.sample(
+                self.requests.errored, error_sample_size
+            )
+            self.requests.incomplete = random.sample(
+                self.requests.incomplete, incomplete_sample_size
+            )
+            self.request_samples = StatusBreakdown(
+                successful=len(self.requests.successful),
+                incomplete=len(self.requests.incomplete),
+                errored=len(self.requests.errored),
+            )
 
-        return sampled_instance
+        return self
 
     @staticmethod
     def from_stats(
```
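
The capping behaviour `set_sample_size` relies on is easy to see in isolation: `random.sample` cannot draw more items than the population holds, so each status bucket is clamped with `min()` first. A toy sketch with made-up data:

```python
# Toy data standing in for the successful/errored request buckets.
import random

successful = list(range(10))
errored = list(range(3))

requested = 5
kept_successful = random.sample(successful, min(requested, len(successful)))
kept_errored = random.sample(errored, min(requested, len(errored)))

print(len(kept_successful))  # 5
print(len(kept_errored))     # 3 -- capped at the size of the bucket
```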

src/guidellm/benchmark/entrypoints.py

Lines changed: 20 additions & 9 deletions
```diff
@@ -1,17 +1,16 @@
 from pathlib import Path
-from typing import Any, Dict, Iterable, List, Literal, Optional, Union
+from typing import Any, Dict, Iterable, List, Literal, Optional, Tuple, Union
 
 from datasets import Dataset, DatasetDict, IterableDataset, IterableDatasetDict
 from transformers import (  # type: ignore[import]
     PreTrainedTokenizerBase,
 )
 
 from guidellm.backend import Backend, BackendType
-from guidellm.benchmark.benchmark import GenerativeBenchmark
 from guidellm.benchmark.benchmarker import GenerativeBenchmarker
 from guidellm.benchmark.output import (
     GenerativeBenchmarksConsole,
-    save_generative_benchmarks,
+    GenerativeBenchmarksReport,
 )
 from guidellm.benchmark.profile import ProfileType, create_profile
 from guidellm.benchmark.progress import GenerativeTextBenchmarkerProgressDisplay
@@ -48,8 +47,9 @@ async def benchmark_generative_text(
     output_console: bool,
     output_path: Optional[Union[str, Path]],
     output_extras: Optional[Dict[str, Any]],
+    output_sampling: Optional[int],
     random_seed: int,
-) -> List[GenerativeBenchmark]:
+) -> Tuple[GenerativeBenchmarksReport, Optional[Path]]:
     console = GenerativeBenchmarksConsole(enabled=show_progress)
     console.print_line("Creating backend...")
     backend = Backend.create(
@@ -100,7 +100,7 @@ async def benchmark_generative_text(
         if show_progress
        else None
     )
-    benchmarks = []
+    report = GenerativeBenchmarksReport()
 
     async for result in benchmarker.run(
         profile=profile,
@@ -115,15 +115,26 @@
         if result.type_ == "benchmark_compiled":
             if result.current_benchmark is None:
                 raise ValueError("Current benchmark is None")
-            benchmarks.append(result.current_benchmark)
+            report.benchmarks.append(
+                result.current_benchmark.set_sample_size(output_sampling)
+            )
 
     if output_console:
-        console.benchmarks = benchmarks
+        orig_enabled = console.enabled
+        console.enabled = True
+        console.benchmarks = report.benchmarks
         console.print_benchmarks_metadata()
         console.print_benchmarks_info()
         console.print_benchmarks_stats()
+        console.enabled = orig_enabled
 
     if output_path:
-        save_generative_benchmarks(benchmarks=benchmarks, path=output_path)
+        console.print_line("\nSaving benchmarks report...")
+        saved_path = report.save_file(output_path)
+        console.print_line(f"Benchmarks report saved to {saved_path}")
+    else:
+        saved_path = None
 
-    return benchmarks
+    console.print_line("\nBenchmarking complete.")
+
+    return report, saved_path
```
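
Callers now get the report object back alongside the path it was written to (or `None` when no `output_path` is given). A sketch of consuming that pair; the `summarize` helper is hypothetical and the full argument list of `benchmark_generative_text` is omitted:

```python
# Hypothetical consumer of the (report, saved_path) pair returned by
# benchmark_generative_text; not part of the commit.
from pathlib import Path
from typing import Optional

from guidellm.benchmark import GenerativeBenchmarksReport


def summarize(report: GenerativeBenchmarksReport, saved_path: Optional[Path]) -> None:
    print(f"{len(report.benchmarks)} benchmark(s) collected")
    if saved_path is not None:
        print(f"Report written to {saved_path}")
```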
