Skip to content

Commit f51448f

Browse files
author
Adam Lugowski
committed
[Bugfix]: Fix Prometheus spec decode counter sum-of-sums
The Prometheus spec decode counters (draft/accepted/emitted token counts) are incremented by the values in spec_decode_metrics. However, those values are aggregates since startup. Therefore, the Prometheus counters are effectively a sum-of-sums instead of just a sum. If a high-traffic vLLM instance is left running for a few hours, those counters start to report absurdly high values, such as a TPS in the tens of millions. Signed-off-by: Adam Lugowski <adam.lugowski@parasail.io>
1 parent 3eb08ed commit f51448f

File tree

2 files changed

+18
-3
lines changed

2 files changed

+18
-3
lines changed

vllm/engine/metrics.py

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
# SPDX-License-Identifier: Apache-2.0
22

3+
from copy import copy
34
import time
45
from typing import TYPE_CHECKING
56
from typing import Counter as CollectionsCounter
@@ -669,20 +670,33 @@ def log(self, stats: Stats):
669670
if local_interval_elapsed(stats.now, self.last_local_log,
670671
self.local_interval):
671672
if self.spec_decode_metrics is not None:
673+
# The counters in self.spec_decode_metrics are aggregates.
674+
# The Prometheus Counters must be incremented with deltas.
675+
# Keep track of the previously seen value so we can compute deltas.
676+
if self.last_spec_decode_metrics is None:
677+
self.last_spec_decode_metrics = copy(self.spec_decode_metrics)
678+
self.last_spec_decode_metrics.accepted_tokens = 0
679+
self.last_spec_decode_metrics.draft_tokens = 0
680+
self.last_spec_decode_metrics.emitted_tokens = 0
681+
682+
snapshot = copy(self.spec_decode_metrics)
683+
672684
self._log_gauge(
673685
self.metrics.gauge_spec_decode_draft_acceptance_rate,
674686
self.spec_decode_metrics.draft_acceptance_rate)
675687
self._log_gauge(self.metrics.gauge_spec_decode_efficiency,
676688
self.spec_decode_metrics.system_efficiency)
677689
self._log_counter(
678690
self.metrics.counter_spec_decode_num_accepted_tokens,
679-
self.spec_decode_metrics.accepted_tokens)
691+
snapshot.accepted_tokens - self.last_spec_decode_metrics.accepted_tokens)
680692
self._log_counter(
681693
self.metrics.counter_spec_decode_num_draft_tokens,
682-
self.spec_decode_metrics.draft_tokens)
694+
snapshot.draft_tokens - self.last_spec_decode_metrics.draft_tokens)
683695
self._log_counter(
684696
self.metrics.counter_spec_decode_num_emitted_tokens,
685-
self.spec_decode_metrics.emitted_tokens)
697+
snapshot.emitted_tokens - self.last_spec_decode_metrics.emitted_tokens)
698+
699+
self.last_spec_decode_metrics = snapshot
686700

687701
# Reset tracked stats for next interval.
688702
self.num_prompt_tokens = []

vllm/engine/metrics_types.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,7 @@ def __init__(self, local_interval: float, vllm_config: VllmConfig) -> None:
8080
self.last_local_log = time.time()
8181
self.local_interval = local_interval
8282
self.spec_decode_metrics: Optional[SpecDecodeWorkerMetrics] = None
83+
self.last_spec_decode_metrics: Optional[SpecDecodeWorkerMetrics] = None
8384

8485
@abstractmethod
8586
def log(self, stats: Stats) -> None:

0 commit comments

Comments
 (0)