From f51448f56f0c056b414e68489fb8e03ab0dd4b30 Mon Sep 17 00:00:00 2001 From: Adam Lugowski Date: Mon, 10 Mar 2025 15:40:10 -0700 Subject: [PATCH 1/2] [Bugfix]: Fix Prometheus spec decode counter sum-of-sums The Prometheus spec decode counters (draft/accepted/emitted token counts) are incremented by the values in spec_decode_metrics. However, those values are aggregates since startup. Therefore, the Prometheus counters are effectively a sum-of-sums instead of just a sum. If a high-traffic vLLM is left on for a few hours, those counters start to suggest absurdly high values like a TPS in the tens of millions. Signed-off-by: Adam Lugowski --- vllm/engine/metrics.py | 20 +++++++++++++++++--- vllm/engine/metrics_types.py | 1 + 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py index 70f36d1290ca..fcf8c095b90c 100644 --- a/vllm/engine/metrics.py +++ b/vllm/engine/metrics.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 +from copy import copy import time from typing import TYPE_CHECKING from typing import Counter as CollectionsCounter @@ -669,6 +670,17 @@ def log(self, stats: Stats): if local_interval_elapsed(stats.now, self.last_local_log, self.local_interval): if self.spec_decode_metrics is not None: + # The counters in self.spec_decode_metrics are aggregates. + # The Prometheus Counters must be incremented with deltas. + # Keep track of the previously seen value so we can compute deltas. 
+ if self.last_spec_decode_metrics is None: + self.last_spec_decode_metrics = copy(self.spec_decode_metrics) + self.last_spec_decode_metrics.accepted_tokens = 0 + self.last_spec_decode_metrics.draft_tokens = 0 + self.last_spec_decode_metrics.emitted_tokens = 0 + + snapshot = copy(self.spec_decode_metrics) + self._log_gauge( self.metrics.gauge_spec_decode_draft_acceptance_rate, self.spec_decode_metrics.draft_acceptance_rate) @@ -676,13 +688,15 @@ def log(self, stats: Stats): self.spec_decode_metrics.system_efficiency) self._log_counter( self.metrics.counter_spec_decode_num_accepted_tokens, - self.spec_decode_metrics.accepted_tokens) + snapshot.accepted_tokens - self.last_spec_decode_metrics.accepted_tokens) self._log_counter( self.metrics.counter_spec_decode_num_draft_tokens, - self.spec_decode_metrics.draft_tokens) + snapshot.draft_tokens - self.last_spec_decode_metrics.draft_tokens) self._log_counter( self.metrics.counter_spec_decode_num_emitted_tokens, - self.spec_decode_metrics.emitted_tokens) + snapshot.emitted_tokens - self.last_spec_decode_metrics.emitted_tokens) + + self.last_spec_decode_metrics = snapshot # Reset tracked stats for next interval. 
self.num_prompt_tokens = [] diff --git a/vllm/engine/metrics_types.py b/vllm/engine/metrics_types.py index 9e6d5ef29bed..00902fbdddea 100644 --- a/vllm/engine/metrics_types.py +++ b/vllm/engine/metrics_types.py @@ -80,6 +80,7 @@ def __init__(self, local_interval: float, vllm_config: VllmConfig) -> None: self.last_local_log = time.time() self.local_interval = local_interval self.spec_decode_metrics: Optional[SpecDecodeWorkerMetrics] = None + self.last_spec_decode_metrics: Optional[SpecDecodeWorkerMetrics] = None @abstractmethod def log(self, stats: Stats) -> None: From c90d710fd740372522a24368c24fc0e6fb6019a1 Mon Sep 17 00:00:00 2001 From: Adam Lugowski Date: Mon, 24 Mar 2025 16:06:50 -0700 Subject: [PATCH 2/2] Linter Signed-off-by: Adam Lugowski --- vllm/engine/metrics.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py index fcf8c095b90c..93d52effe9fe 100644 --- a/vllm/engine/metrics.py +++ b/vllm/engine/metrics.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 -from copy import copy import time +from copy import copy from typing import TYPE_CHECKING from typing import Counter as CollectionsCounter from typing import Dict, List, Optional, Type, Union, cast @@ -672,9 +672,10 @@ def log(self, stats: Stats): if self.spec_decode_metrics is not None: # The counters in self.spec_decode_metrics are aggregates. # The Prometheus Counters must be incremented with deltas. - # Keep track of the previously seen value so we can compute deltas. + # Keep track of the previous value so we can compute deltas. 
if self.last_spec_decode_metrics is None: - self.last_spec_decode_metrics = copy(self.spec_decode_metrics) + self.last_spec_decode_metrics = copy( + self.spec_decode_metrics) self.last_spec_decode_metrics.accepted_tokens = 0 self.last_spec_decode_metrics.draft_tokens = 0 self.last_spec_decode_metrics.emitted_tokens = 0 @@ -688,13 +689,16 @@ def log(self, stats: Stats): self.spec_decode_metrics.system_efficiency) self._log_counter( self.metrics.counter_spec_decode_num_accepted_tokens, - snapshot.accepted_tokens - self.last_spec_decode_metrics.accepted_tokens) + snapshot.accepted_tokens - + self.last_spec_decode_metrics.accepted_tokens) self._log_counter( self.metrics.counter_spec_decode_num_draft_tokens, - snapshot.draft_tokens - self.last_spec_decode_metrics.draft_tokens) + snapshot.draft_tokens - + self.last_spec_decode_metrics.draft_tokens) self._log_counter( self.metrics.counter_spec_decode_num_emitted_tokens, - snapshot.emitted_tokens - self.last_spec_decode_metrics.emitted_tokens) + snapshot.emitted_tokens - + self.last_spec_decode_metrics.emitted_tokens) self.last_spec_decode_metrics = snapshot