From 24bc46585b2f4554450ec966a2fb285c35067063 Mon Sep 17 00:00:00 2001
From: sfeng33 <4florafeng@gmail.com>
Date: Sun, 6 Jul 2025 06:24:14 +0000
Subject: [PATCH 1/2] Add memory tracking

Signed-off-by: sfeng33 <4florafeng@gmail.com>
---
 .../performance-benchmarks-descriptions.md |  2 +-
 benchmarks/README.md                       |  2 ++
 benchmarks/benchmark_serving.py            | 24 ++++++++++++++++++-
 benchmarks/benchmark_utils.py              | 16 +++++++++++++
 4 files changed, 42 insertions(+), 2 deletions(-)

diff --git a/.buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md b/.buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md
index a1f8441ccda..949803b03f3 100644
--- a/.buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md
+++ b/.buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md
@@ -30,7 +30,7 @@
 - GPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
 - We also added a speculative decoding test for llama-3 70B on GPU, under QPS 2
 - CPU Models: llama-3.1 8B.
-- Evaluation metrics: throughput, TTFT (time to the first token, with mean, median and p99), ITL (inter-token latency, with mean, median and p99).
+- Evaluation metrics: throughput, TTFT (time to the first token, with mean, median and p99), ITL (inter-token latency, with mean, median and p99), peak memory usage, memory per request.
 - For CPU, we added random dataset tests to benchmark fixed input/output length with 100 prompts.
 
 {serving_tests_markdown_table}
diff --git a/benchmarks/README.md b/benchmarks/README.md
index fb8690d42db..4b66a574474 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -118,6 +118,8 @@ Total generated tokens: 2212
 Request throughput (req/s): 1.73
 Output token throughput (tok/s): 382.89
 Total Token throughput (tok/s): 619.85
+Peak memory usage (GiB): 1.59
+Memory per request (MiB): 162.41
 ---------------Time to First Token----------------
 Mean TTFT (ms): 71.54
 Median TTFT (ms): 73.88
diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py
index 9b235266dff..bed25fbfc86 100644
--- a/benchmarks/benchmark_serving.py
+++ b/benchmarks/benchmark_serving.py
@@ -72,7 +72,11 @@
     SonnetDataset,
     VisionArenaDataset,
 )
-from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
+from benchmark_utils import (
+    convert_to_pytorch_benchmark_format,
+    get_memory_usage,
+    write_to_json,
+)
 
 MILLISECONDS_TO_SECONDS_CONVERSION = 1000
 
@@ -105,6 +109,9 @@ class BenchmarkMetrics:
     median_e2el_ms: float
     std_e2el_ms: float
     percentiles_e2el_ms: list[tuple[float, float]]
+    # Memory usage metrics
+    peak_memory_gb: float
+    memory_per_request_mb: float
 
 
 def _get_current_request_rate(
@@ -210,6 +217,7 @@ def calculate_metrics(
     selected_percentile_metrics: list[str],
     selected_percentiles: list[float],
     goodput_config_dict: dict[str, float],
+    peak_memory_gb: float = 0.0,
 ) -> tuple[BenchmarkMetrics, list[int]]:
     actual_output_lens: list[int] = []
     total_input = 0
@@ -315,6 +323,8 @@ def calculate_metrics(
         percentiles_e2el_ms=[
             (p, np.percentile(e2els or 0, p) * 1000) for p in selected_percentiles
         ],
+        peak_memory_gb=peak_memory_gb,
+        memory_per_request_mb=(peak_memory_gb * 1024) / max(completed, 1),
     )
 
     return metrics, actual_output_lens
@@ -508,6 +518,9 @@ async def limited_request_func(request_func_input, pbar):
 
     benchmark_duration = time.perf_counter() - benchmark_start_time
 
+    # Track peak memory after benchmark ends
+    peak_memory = get_memory_usage()
+
     metrics, actual_output_lens = calculate_metrics(
         input_requests=input_requests,
         outputs=outputs,
@@ -516,6 +529,7 @@ async def limited_request_func(request_func_input, pbar):
         selected_percentile_metrics=selected_percentile_metrics,
         selected_percentiles=selected_percentiles,
         goodput_config_dict=goodput_config_dict,
+        peak_memory_gb=peak_memory,
     )
 
     print("{s:{c}^{n}}".format(s=" Serving Benchmark Result ", n=50, c="="))
@@ -544,6 +558,12 @@ async def limited_request_func(request_func_input, pbar):
             "Total Token throughput (tok/s):", metrics.total_token_throughput
         )
     )
+    print("{:<40} {:<10.2f}".format("Peak memory usage (GiB):", metrics.peak_memory_gb))
+    print(
+        "{:<40} {:<10.2f}".format(
+            "Memory per request (MiB):", metrics.memory_per_request_mb
+        )
+    )
 
     result = {
         "duration": benchmark_duration,
@@ -560,6 +580,8 @@ async def limited_request_func(request_func_input, pbar):
         "itls": [output.itl for output in outputs],
         "generated_texts": [output.generated_text for output in outputs],
         "errors": [output.error for output in outputs],
+        "peak_memory_gb": metrics.peak_memory_gb,
+        "memory_per_request_mb": metrics.memory_per_request_mb,
     }
 
     if rps_change_events:
diff --git a/benchmarks/benchmark_utils.py b/benchmarks/benchmark_utils.py
index 283f938df50..32a0d4f360f 100644
--- a/benchmarks/benchmark_utils.py
+++ b/benchmarks/benchmark_utils.py
@@ -5,6 +5,7 @@
 import json
 import math
 import os
+import resource
 from typing import Any
 
 
@@ -72,3 +73,18 @@ def write_to_json(filename: str, records: list) -> None:
         cls=InfEncoder,
         default=lambda o: f"<{type(o).__name__} object is not JSON serializable>",
     )
+
+
+def get_memory_usage() -> float:
+    """Get peak memory usage in GiB using resource.getrusage()."""
+    # Note: ru_maxrss is in kilobytes on Linux, bytes on macOS
+    import platform
+
+    # macOS: ru_maxrss in bytes, Linux: ru_maxrss in kilobytes
+    divisor = 1 << 30 if platform.system() == "Darwin" else 1 << 20
+
+    max_self_usage = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / divisor
+    max_children_usage = (
+        resource.getrusage(resource.RUSAGE_CHILDREN).ru_maxrss / divisor
+    )
+    return max_self_usage + max_children_usage

From bb9b3b3acd325a0b00d19f1f5771e397d5744cc4 Mon Sep 17 00:00:00 2001
From: Flora Feng <4florafeng@gmail.com>
Date: Sat, 5 Jul 2025 23:30:29 -0700
Subject: [PATCH 2/2] Update
 .buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md

Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
---
 .../nightly-benchmarks/performance-benchmarks-descriptions.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md b/.buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md
index 949803b03f3..6efcc67e370 100644
--- a/.buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md
+++ b/.buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md
@@ -30,7 +30,7 @@
 - GPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
 - We also added a speculative decoding test for llama-3 70B on GPU, under QPS 2
 - CPU Models: llama-3.1 8B.
-- Evaluation metrics: throughput, TTFT (time to the first token, with mean, median and p99), ITL (inter-token latency, with mean, median and p99), peak memory usage, memory per request.
+- Evaluation metrics: throughput, TTFT (time to the first token, with mean, median and p99), ITL (inter-token latency, with mean, median and p99), peak memory usage (GiB), memory per request (MiB).
 - For CPU, we added random dataset tests to benchmark fixed input/output length with 100 prompts.
 
 {serving_tests_markdown_table}
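
Standalone sketch of the unit handling used above: it mirrors the getrusage-based
helper and shows how the per-request figure in MiB is derived from the GiB peak.
The request count of 100 and the names peak_rss_gib / per_request_mib are made up
for illustration and are not part of the patches.

    import platform
    import resource


    def peak_rss_gib() -> float:
        """Peak RSS of this process plus its children, in GiB."""
        # ru_maxrss is reported in bytes on macOS and in kilobytes on Linux,
        # so pick the divisor that converts it to GiB on each platform.
        divisor = 1 << 30 if platform.system() == "Darwin" else 1 << 20
        self_peak = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / divisor
        children_peak = resource.getrusage(resource.RUSAGE_CHILDREN).ru_maxrss / divisor
        return self_peak + children_peak


    peak_gib = peak_rss_gib()
    completed = 100  # hypothetical number of completed requests
    per_request_mib = (peak_gib * 1024) / max(completed, 1)
    print(f"Peak memory usage (GiB): {peak_gib:.2f}")
    print(f"Memory per request (MiB): {per_request_mib:.2f}")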