[Benchmarks] Add memory tracking to serving benchmark #20519

Open · wants to merge 2 commits into main
@@ -30,7 +30,7 @@
- GPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
- We also added a speculative decoding test for llama-3 70B on GPU, under QPS 2
- CPU Models: llama-3.1 8B.
- Evaluation metrics: throughput, TTFT (time to the first token, with mean, median and p99), ITL (inter-token latency, with mean, median and p99).
- Evaluation metrics: throughput, TTFT (time to the first token, with mean, median and p99), ITL (inter-token latency, with mean, median and p99), peak memory usage (GiB), memory per request (MiB).
- For CPU, we added random dataset tests to benchmark fixed input/output length with 100 prompts.

{serving_tests_markdown_table}
2 changes: 2 additions & 0 deletions benchmarks/README.md
@@ -118,6 +118,8 @@ Total generated tokens: 2212
Request throughput (req/s): 1.73
Output token throughput (tok/s): 382.89
Total Token throughput (tok/s): 619.85
Peak memory usage (GiB): 1.59
Memory per request (MiB): 162.41
---------------Time to First Token----------------
Mean TTFT (ms): 71.54
Median TTFT (ms): 73.88
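The second new line is derived from the first: peak memory is converted from GiB to MiB and divided by the number of completed requests. A minimal sketch of that formula with purely illustrative numbers (the request count here is hypothetical, not taken from the sample output):

```python
# Illustrative only: "completed" is however many requests finished successfully.
peak_memory_gb = 1.59          # peak RSS reported for the run, in GiB
completed = 10                 # hypothetical number of completed requests
memory_per_request_mb = (peak_memory_gb * 1024) / max(completed, 1)
print(f"Memory per request (MiB): {memory_per_request_mb:.2f}")  # 162.82
```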
24 changes: 23 additions & 1 deletion benchmarks/benchmark_serving.py
@@ -72,7 +72,11 @@
SonnetDataset,
VisionArenaDataset,
)
from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
from benchmark_utils import (
convert_to_pytorch_benchmark_format,
get_memory_usage,
write_to_json,
)

MILLISECONDS_TO_SECONDS_CONVERSION = 1000

@@ -105,6 +109,9 @@ class BenchmarkMetrics:
median_e2el_ms: float
std_e2el_ms: float
percentiles_e2el_ms: list[tuple[float, float]]
# Memory usage metrics
peak_memory_gb: float
memory_per_request_mb: float


def _get_current_request_rate(
@@ -210,6 +217,7 @@ def calculate_metrics(
selected_percentile_metrics: list[str],
selected_percentiles: list[float],
goodput_config_dict: dict[str, float],
peak_memory_gb: float = 0.0,
) -> tuple[BenchmarkMetrics, list[int]]:
actual_output_lens: list[int] = []
total_input = 0
@@ -315,6 +323,8 @@ def calculate_metrics(
percentiles_e2el_ms=[
(p, np.percentile(e2els or 0, p) * 1000) for p in selected_percentiles
],
peak_memory_gb=peak_memory_gb,
memory_per_request_mb=(peak_memory_gb * 1024) / max(completed, 1),
)

return metrics, actual_output_lens
@@ -508,6 +518,9 @@ async def limited_request_func(request_func_input, pbar):

benchmark_duration = time.perf_counter() - benchmark_start_time

# Track peak memory after benchmark ends
peak_memory = get_memory_usage()

metrics, actual_output_lens = calculate_metrics(
input_requests=input_requests,
outputs=outputs,
@@ -516,6 +529,7 @@ async def limited_request_func(request_func_input, pbar):
selected_percentile_metrics=selected_percentile_metrics,
selected_percentiles=selected_percentiles,
goodput_config_dict=goodput_config_dict,
peak_memory_gb=peak_memory,
)

print("{s:{c}^{n}}".format(s=" Serving Benchmark Result ", n=50, c="="))
@@ -544,6 +558,12 @@ async def limited_request_func(request_func_input, pbar):
"Total Token throughput (tok/s):", metrics.total_token_throughput
)
)
print("{:<40} {:<10.2f}".format("Peak memory usage (GiB):", metrics.peak_memory_gb))
print(
"{:<40} {:<10.2f}".format(
"Memory per request (MiB):", metrics.memory_per_request_mb
)
)

result = {
"duration": benchmark_duration,
@@ -560,6 +580,8 @@ async def limited_request_func(request_func_input, pbar):
"itls": [output.itl for output in outputs],
"generated_texts": [output.generated_text for output in outputs],
"errors": [output.error for output in outputs],
"peak_memory_gb": metrics.peak_memory_gb,
"memory_per_request_mb": metrics.memory_per_request_mb,
}

if rps_change_events:
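The result dict now carries `peak_memory_gb` and `memory_per_request_mb` alongside the existing fields. A hedged sketch of reading them back, assuming the dict shown above is what a run serializes and with `results.json` as a placeholder path:

```python
import json

# "results.json" is a hypothetical path; use whatever file the benchmark
# run wrote its result dict to.
with open("results.json") as f:
    result = json.load(f)

print(f"Peak memory usage (GiB):  {result['peak_memory_gb']:.2f}")
print(f"Memory per request (MiB): {result['memory_per_request_mb']:.2f}")
```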
16 changes: 16 additions & 0 deletions benchmarks/benchmark_utils.py
@@ -5,6 +5,7 @@
import json
import math
import os
import resource
from typing import Any


@@ -72,3 +73,18 @@ def write_to_json(filename: str, records: list) -> None:
cls=InfEncoder,
default=lambda o: f"<{type(o).__name__} object is not JSON serializable>",
)


def get_memory_usage() -> float:
"""Get peak memory usage in GiB using resource.getrusage()."""
# Note: ru_maxrss is in kilobytes on Linux, bytes on macOS
import platform

# macOS: ru_maxrss in bytes, Linux: ru_maxrss in kilobytes
divisor = 1 << 30 if platform.system() == "Darwin" else 1 << 20

max_self_usage = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / divisor
max_children_usage = (
resource.getrusage(resource.RUSAGE_CHILDREN).ru_maxrss / divisor
)
return max_self_usage + max_children_usage
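For reference, a self-contained sketch of the same `resource.getrusage()` technique outside the benchmark script; `peak_rss_gib` is a hypothetical name and the demo allocation is only there to move the high-water mark. Because `ru_maxrss` tracks the calling process (plus its waited-for child processes), the figure reflects the benchmark client itself rather than a separately launched server.

```python
import platform
import resource


def peak_rss_gib() -> float:
    """Peak RSS of this process plus its waited-for children, in GiB."""
    # ru_maxrss is reported in bytes on macOS and in kilobytes on Linux.
    divisor = 1 << 30 if platform.system() == "Darwin" else 1 << 20
    self_peak = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / divisor
    children_peak = resource.getrusage(resource.RUSAGE_CHILDREN).ru_maxrss / divisor
    return self_peak + children_peak


if __name__ == "__main__":
    _scratch = bytearray(200 * 1024 * 1024)  # ~200 MiB, raises the high-water mark
    print(f"Peak RSS so far: {peak_rss_gib():.2f} GiB")
```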