[Benchmarks] Add memory tracking to serving benchmark #20519

Open · wants to merge 2 commits into main
@@ -30,7 +30,7 @@
- GPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
- We also added a speculative decoding test for llama-3 70B on GPU, under QPS 2
- CPU Models: llama-3.1 8B.
- Evaluation metrics: throughput, TTFT (time to the first token, with mean, median and p99), ITL (inter-token latency, with mean, median and p99).
- Evaluation metrics: throughput, TTFT (time to the first token, with mean, median and p99), ITL (inter-token latency, with mean, median and p99), peak memory usage (GiB), memory per request (MiB).
- For CPU, we added random dataset tests to benchmark fixed input/output length with 100 prompts.

{serving_tests_markdown_table}
2 changes: 2 additions & 0 deletions benchmarks/README.md
@@ -118,6 +118,8 @@ Total generated tokens: 2212
Request throughput (req/s): 1.73
Output token throughput (tok/s): 382.89
Total Token throughput (tok/s): 619.85
Peak memory usage (GiB): 1.59
Memory per request (MiB): 162.41
---------------Time to First Token----------------
Mean TTFT (ms): 71.54
Median TTFT (ms): 73.88
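The second new line is derived from the first: peak memory is converted from GiB to MiB and divided by the number of completed requests. A minimal sketch of that formula with purely illustrative numbers (the request count here is hypothetical, not taken from the sample output):

```python
# Illustrative only: "completed" is however many requests finished successfully.
peak_memory_gb = 1.59          # peak RSS reported for the run, in GiB
completed = 10                 # hypothetical number of completed requests
memory_per_request_mb = (peak_memory_gb * 1024) / max(completed, 1)
print(f"Memory per request (MiB): {memory_per_request_mb:.2f}")  # 162.82
```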
24 changes: 23 additions & 1 deletion benchmarks/benchmark_serving.py
@@ -72,7 +72,11 @@
SonnetDataset,
VisionArenaDataset,
)
from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
from benchmark_utils import (
convert_to_pytorch_benchmark_format,
get_memory_usage,
write_to_json,
)

MILLISECONDS_TO_SECONDS_CONVERSION = 1000

@@ -105,6 +109,9 @@ class BenchmarkMetrics:
median_e2el_ms: float
std_e2el_ms: float
percentiles_e2el_ms: list[tuple[float, float]]
# Memory usage metrics
peak_memory_gb: float
memory_per_request_mb: float


def _get_current_request_rate(
@@ -210,6 +217,7 @@ def calculate_metrics(
selected_percentile_metrics: list[str],
selected_percentiles: list[float],
goodput_config_dict: dict[str, float],
peak_memory_gb: float = 0.0,
) -> tuple[BenchmarkMetrics, list[int]]:
actual_output_lens: list[int] = []
total_input = 0
@@ -315,6 +323,8 @@ def calculate_metrics(
percentiles_e2el_ms=[
(p, np.percentile(e2els or 0, p) * 1000) for p in selected_percentiles
],
peak_memory_gb=peak_memory_gb,
memory_per_request_mb=(peak_memory_gb * 1024) / max(completed, 1),
)

return metrics, actual_output_lens
@@ -508,6 +518,9 @@ async def limited_request_func(request_func_input, pbar):

benchmark_duration = time.perf_counter() - benchmark_start_time

# Track peak memory after benchmark ends
peak_memory = get_memory_usage()

metrics, actual_output_lens = calculate_metrics(
input_requests=input_requests,
outputs=outputs,
@@ -516,6 +529,7 @@ async def limited_request_func(request_func_input, pbar):
selected_percentile_metrics=selected_percentile_metrics,
selected_percentiles=selected_percentiles,
goodput_config_dict=goodput_config_dict,
peak_memory_gb=peak_memory,
)

print("{s:{c}^{n}}".format(s=" Serving Benchmark Result ", n=50, c="="))
@@ -544,6 +558,12 @@ async def limited_request_func(request_func_input, pbar):
"Total Token throughput (tok/s):", metrics.total_token_throughput
)
)
print("{:<40} {:<10.2f}".format("Peak memory usage (GiB):", metrics.peak_memory_gb))
print(
"{:<40} {:<10.2f}".format(
"Memory per request (MiB):", metrics.memory_per_request_mb
)
)

result = {
"duration": benchmark_duration,
@@ -560,6 +580,8 @@ async def limited_request_func(request_func_input, pbar):
"itls": [output.itl for output in outputs],
"generated_texts": [output.generated_text for output in outputs],
"errors": [output.error for output in outputs],
"peak_memory_gb": metrics.peak_memory_gb,
"memory_per_request_mb": metrics.memory_per_request_mb,
}

if rps_change_events:
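The result dict now carries `peak_memory_gb` and `memory_per_request_mb` alongside the existing fields. A hedged sketch of reading them back, assuming the dict shown above is what a run serializes and with `results.json` as a placeholder path:

```python
import json

# "results.json" is a hypothetical path; use whatever file the benchmark
# run wrote its result dict to.
with open("results.json") as f:
    result = json.load(f)

print(f"Peak memory usage (GiB):  {result['peak_memory_gb']:.2f}")
print(f"Memory per request (MiB): {result['memory_per_request_mb']:.2f}")
```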
16 changes: 16 additions & 0 deletions benchmarks/benchmark_utils.py
@@ -5,6 +5,7 @@
import json
import math
import os
import resource
from typing import Any


@@ -72,3 +73,18 @@ def write_to_json(filename: str, records: list) -> None:
cls=InfEncoder,
default=lambda o: f"<{type(o).__name__} object is not JSON serializable>",
)


def get_memory_usage() -> float:
"""Get peak memory usage in GiB using resource.getrusage()."""
# Note: ru_maxrss is in kilobytes on Linux, bytes on macOS
import platform

# macOS: ru_maxrss in bytes, Linux: ru_maxrss in kilobytes
divisor = 1 << 30 if platform.system() == "Darwin" else 1 << 20

max_self_usage = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / divisor
max_children_usage = (
resource.getrusage(resource.RUSAGE_CHILDREN).ru_maxrss / divisor
)
return max_self_usage + max_children_usage
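For reference, a self-contained sketch of the same `resource.getrusage()` technique outside the benchmark script; `peak_rss_gib` is a hypothetical name and the demo allocation is only there to move the high-water mark. Because `ru_maxrss` tracks the calling process (plus its waited-for child processes), the figure reflects the benchmark client itself rather than a separately launched server.

```python
import platform
import resource


def peak_rss_gib() -> float:
    """Peak RSS of this process plus its waited-for children, in GiB."""
    # ru_maxrss is reported in bytes on macOS and in kilobytes on Linux.
    divisor = 1 << 30 if platform.system() == "Darwin" else 1 << 20
    self_peak = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / divisor
    children_peak = resource.getrusage(resource.RUSAGE_CHILDREN).ru_maxrss / divisor
    return self_peak + children_peak


if __name__ == "__main__":
    _scratch = bytearray(200 * 1024 * 1024)  # ~200 MiB, raises the high-water mark
    print(f"Peak RSS so far: {peak_rss_gib():.2f} GiB")
```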