From 24bc46585b2f4554450ec966a2fb285c35067063 Mon Sep 17 00:00:00 2001
From: sfeng33 <4florafeng@gmail.com>
Date: Sun, 6 Jul 2025 06:24:14 +0000
Subject: [PATCH 1/2] Add memory tracking

Signed-off-by: sfeng33 <4florafeng@gmail.com>
---
 .../performance-benchmarks-descriptions.md |  2 +-
 benchmarks/README.md                       |  2 ++
 benchmarks/benchmark_serving.py            | 24 ++++++++++++++++++-
 benchmarks/benchmark_utils.py              | 16 +++++++++++++
 4 files changed, 42 insertions(+), 2 deletions(-)

diff --git a/.buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md b/.buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md
index a1f8441ccda..949803b03f3 100644
--- a/.buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md
+++ b/.buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md
@@ -30,7 +30,7 @@
 - GPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
 - We also added a speculative decoding test for llama-3 70B on GPU, under QPS 2
 - CPU Models: llama-3.1 8B.
-- Evaluation metrics: throughput, TTFT (time to the first token, with mean, median and p99), ITL (inter-token latency, with mean, median and p99).
+- Evaluation metrics: throughput, TTFT (time to the first token, with mean, median and p99), ITL (inter-token latency, with mean, median and p99), peak memory usage, memory per request.
 - For CPU, we added random dataset tests to benchmark fixed input/output length with 100 prompts.
 
 {serving_tests_markdown_table}
diff --git a/benchmarks/README.md b/benchmarks/README.md
index fb8690d42db..4b66a574474 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -118,6 +118,8 @@ Total generated tokens: 2212
 Request throughput (req/s): 1.73
 Output token throughput (tok/s): 382.89
 Total Token throughput (tok/s): 619.85
+Peak memory usage (GiB): 1.59
+Memory per request (MiB): 162.41
 ---------------Time to First Token----------------
 Mean TTFT (ms): 71.54
 Median TTFT (ms): 73.88
diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py
index 9b235266dff..bed25fbfc86 100644
--- a/benchmarks/benchmark_serving.py
+++ b/benchmarks/benchmark_serving.py
@@ -72,7 +72,11 @@
     SonnetDataset,
     VisionArenaDataset,
 )
-from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
+from benchmark_utils import (
+    convert_to_pytorch_benchmark_format,
+    get_memory_usage,
+    write_to_json,
+)
 
 MILLISECONDS_TO_SECONDS_CONVERSION = 1000
 
@@ -105,6 +109,9 @@ class BenchmarkMetrics:
     median_e2el_ms: float
     std_e2el_ms: float
     percentiles_e2el_ms: list[tuple[float, float]]
+    # Memory usage metrics
+    peak_memory_gb: float
+    memory_per_request_mb: float
 
 
 def _get_current_request_rate(
@@ -210,6 +217,7 @@ def calculate_metrics(
     selected_percentile_metrics: list[str],
     selected_percentiles: list[float],
     goodput_config_dict: dict[str, float],
+    peak_memory_gb: float = 0.0,
 ) -> tuple[BenchmarkMetrics, list[int]]:
     actual_output_lens: list[int] = []
     total_input = 0
@@ -315,6 +323,8 @@ def calculate_metrics(
         percentiles_e2el_ms=[
             (p, np.percentile(e2els or 0, p) * 1000) for p in selected_percentiles
         ],
+        peak_memory_gb=peak_memory_gb,
+        memory_per_request_mb=(peak_memory_gb * 1024) / max(completed, 1),
     )
 
     return metrics, actual_output_lens
@@ -508,6 +518,9 @@ async def limited_request_func(request_func_input, pbar):
 
     benchmark_duration = time.perf_counter() - benchmark_start_time
 
+    # Track peak memory after benchmark ends
+    peak_memory = get_memory_usage()
+
     metrics, actual_output_lens = calculate_metrics(
         input_requests=input_requests,
         outputs=outputs,
@@ -516,6 +529,7 @@ async def limited_request_func(request_func_input, pbar):
         selected_percentile_metrics=selected_percentile_metrics,
         selected_percentiles=selected_percentiles,
         goodput_config_dict=goodput_config_dict,
+        peak_memory_gb=peak_memory,
     )
 
     print("{s:{c}^{n}}".format(s=" Serving Benchmark Result ", n=50, c="="))
@@ -544,6 +558,12 @@ async def limited_request_func(request_func_input, pbar):
             "Total Token throughput (tok/s):", metrics.total_token_throughput
         )
     )
+    print("{:<40} {:<10.2f}".format("Peak memory usage (GiB):", metrics.peak_memory_gb))
+    print(
+        "{:<40} {:<10.2f}".format(
+            "Memory per request (MiB):", metrics.memory_per_request_mb
+        )
+    )
 
     result = {
         "duration": benchmark_duration,
@@ -560,6 +580,8 @@ async def limited_request_func(request_func_input, pbar):
         "itls": [output.itl for output in outputs],
         "generated_texts": [output.generated_text for output in outputs],
         "errors": [output.error for output in outputs],
+        "peak_memory_gb": metrics.peak_memory_gb,
+        "memory_per_request_mb": metrics.memory_per_request_mb,
     }
 
     if rps_change_events:
diff --git a/benchmarks/benchmark_utils.py b/benchmarks/benchmark_utils.py
index 283f938df50..32a0d4f360f 100644
--- a/benchmarks/benchmark_utils.py
+++ b/benchmarks/benchmark_utils.py
@@ -5,6 +5,7 @@
 import json
 import math
 import os
+import resource
 from typing import Any
 
 
@@ -72,3 +73,18 @@ def write_to_json(filename: str, records: list) -> None:
         cls=InfEncoder,
         default=lambda o: f"<{type(o).__name__} object is not JSON serializable>",
     )
+
+
+def get_memory_usage() -> float:
+    """Get peak memory usage in GiB using resource.getrusage()."""
+    # Note: ru_maxrss is in kilobytes on Linux, bytes on macOS
+    import platform
+
+    # macOS: ru_maxrss in bytes, Linux: ru_maxrss in kilobytes
+    divisor = 1 << 30 if platform.system() == "Darwin" else 1 << 20
+
+    max_self_usage = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / divisor
+    max_children_usage = (
+        resource.getrusage(resource.RUSAGE_CHILDREN).ru_maxrss / divisor
+    )
+    return max_self_usage + max_children_usage

From bb9b3b3acd325a0b00d19f1f5771e397d5744cc4 Mon Sep 17 00:00:00 2001
From: Flora Feng <4florafeng@gmail.com>
Date: Sat, 5 Jul 2025 23:30:29 -0700
Subject: [PATCH 2/2] Update
 .buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md

Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
---
 .../nightly-benchmarks/performance-benchmarks-descriptions.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md b/.buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md
index 949803b03f3..6efcc67e370 100644
--- a/.buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md
+++ b/.buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md
@@ -30,7 +30,7 @@
 - GPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
 - We also added a speculative decoding test for llama-3 70B on GPU, under QPS 2
 - CPU Models: llama-3.1 8B.
-- Evaluation metrics: throughput, TTFT (time to the first token, with mean, median and p99), ITL (inter-token latency, with mean, median and p99), peak memory usage, memory per request.
+- Evaluation metrics: throughput, TTFT (time to the first token, with mean, median and p99), ITL (inter-token latency, with mean, median and p99), peak memory usage (GiB), memory per request (MiB).
 - For CPU, we added random dataset tests to benchmark fixed input/output length with 100 prompts.
 
 {serving_tests_markdown_table}
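
Standalone sketch of the unit handling used above: it mirrors the getrusage-based
helper and shows how the per-request figure in MiB is derived from the GiB peak.
The request count of 100 and the names peak_rss_gib / per_request_mib are made up
for illustration and are not part of the patches.

    import platform
    import resource


    def peak_rss_gib() -> float:
        """Peak RSS of this process plus its children, in GiB."""
        # ru_maxrss is reported in bytes on macOS and in kilobytes on Linux,
        # so pick the divisor that converts it to GiB on each platform.
        divisor = 1 << 30 if platform.system() == "Darwin" else 1 << 20
        self_peak = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / divisor
        children_peak = resource.getrusage(resource.RUSAGE_CHILDREN).ru_maxrss / divisor
        return self_peak + children_peak


    peak_gib = peak_rss_gib()
    completed = 100  # hypothetical number of completed requests
    per_request_mib = (peak_gib * 1024) / max(completed, 1)
    print(f"Peak memory usage (GiB): {peak_gib:.2f}")
    print(f"Memory per request (MiB): {per_request_mib:.2f}")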