Skip to content

Commit 42d4001

Browse files
authored
[Features] Add speculative metrics (#2857)
1 parent 52aca23 commit 42d4001

File tree

2 files changed

+164
-10
lines changed

2 files changed

+164
-10
lines changed

fastdeploy/metrics/metrics.py

Lines changed: 79 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -107,8 +107,6 @@ def get_filtered_metrics(exclude_names: Set[str], extra_register_func=None) -> s
107107
]
108108

109109

110-
111-
112110
class MetricsManager:
113111
"""Prometheus Metrics Manager handles all metric updates """
114112

@@ -126,6 +124,12 @@ class MetricsManager:
126124
request_decode_time: 'Histogram'
127125
request_generation_tokens: 'Histogram'
128126
request_success_total: 'Counter'
127+
spec_decode_draft_acceptance_rate: 'Gauge'
128+
spec_decode_efficiency: 'Gauge'
129+
spec_decode_num_accepted_tokens_total: 'Counter'
130+
spec_decode_num_draft_tokens_total: 'Counter'
131+
spec_decode_num_emitted_tokens_total: 'Counter'
132+
spec_decode_draft_single_head_acceptance_rate: 'list[Gauge]'
129133

130134
# 定义所有指标配置
131135
METRICS = {
@@ -216,8 +220,9 @@ class MetricsManager:
216220
'name': 'fastdeploy:request_success_total',
217221
'description': 'Total number of successfully processed requests',
218222
'kwargs': {}
219-
}
223+
},
220224
}
225+
SPECULATIVE_METRICS = {}
221226

222227
def __init__(self):
223228
"""Initializes the Prometheus metrics and starts the HTTP server if not already initialized."""
@@ -229,6 +234,75 @@ def __init__(self):
229234
**config['kwargs']
230235
))
231236

237+
def _init_speculative_metrics(self, speculative_method, num_speculative_tokens):
    """Build the speculative-decoding metric table and create its collectors.

    The table is stored on the instance as ``SPECULATIVE_METRICS`` and every
    collector is attached to the instance under its table key.

    Args:
        speculative_method: Name of the speculative method. ``"mtp"``
            additionally enables the efficiency gauge, the draft-token
            counter and the per-head acceptance-rate gauges.
        num_speculative_tokens: Number of per-head gauges to create; the
            gauge for head ``i`` is exported with an ``_{i}`` name suffix.
    """
    per_head_key = "spec_decode_draft_single_head_acceptance_rate"

    def entry(kind, exported_name, help_text):
        # Every table row has the same shape; "kwargs" is a fresh dict per row.
        return {
            "type": kind,
            "name": exported_name,
            "description": help_text,
            "kwargs": {},
        }

    # Metrics created for every speculative method.
    self.SPECULATIVE_METRICS = {
        "spec_decode_draft_acceptance_rate": entry(
            Gauge,
            "fastdeploy:spec_decode_draft_acceptance_rate",
            "Acceptance rate of speculative decoding",
        ),
        "spec_decode_num_accepted_tokens_total": entry(
            Counter,
            "fastdeploy:spec_decode_num_accepted_tokens_total",
            "Total number of tokens accepted by the scoring model and verification program",
        ),
        "spec_decode_num_emitted_tokens_total": entry(
            Counter,
            "fastdeploy:spec_decode_num_emitted_tokens_total",
            "Total number of tokens output by the entire system",
        ),
    }

    # Metrics that are only created for the "mtp" method.
    if speculative_method == "mtp":
        self.SPECULATIVE_METRICS.update(
            {
                "spec_decode_efficiency": entry(
                    Gauge,
                    "fastdeploy:spec_decode_efficiency",
                    "Efficiency of speculative decoding",
                ),
                "spec_decode_num_draft_tokens_total": entry(
                    Counter,
                    "fastdeploy:spec_decode_num_draft_tokens_total",
                    "Total number of speculative tokens generated by the proposal method",
                ),
                per_head_key: entry(
                    list[Gauge],
                    "fastdeploy:spec_decode_draft_single_head_acceptance_rate",
                    "Single head acceptance rate of speculative decoding",
                ),
            }
        )

    for metric_name, config in self.SPECULATIVE_METRICS.items():
        if metric_name != per_head_key:
            collector = config["type"](
                config["name"], config["description"], **config["kwargs"]
            )
            setattr(self, metric_name, collector)
            continue

        # The per-head entry expands into one gauge per draft head, stored
        # as a list indexed by head.
        per_head_gauges = [
            Gauge(
                f"{config['name']}_{i}",
                f"{config['description']} (head {i})",
            )
            for i in range(num_speculative_tokens)
        ]
        setattr(self, metric_name, per_head_gauges)
296+
297+
def register_speculative_metrics(self, registry: CollectorRegistry):
    """Register every speculative-decoding collector with ``registry``.

    Walks ``SPECULATIVE_METRICS`` in order and registers the attribute
    stored under each key. The per-head acceptance-rate entry holds a list
    of gauges, so each element of that list is registered individually.

    Args:
        registry: Registry whose ``register`` method receives each collector.
    """
    per_head_key = "spec_decode_draft_single_head_acceptance_rate"
    for name in self.SPECULATIVE_METRICS:
        stored = getattr(self, name)
        # Treat a single collector as a one-element batch so both cases
        # share the same registration loop.
        batch = stored if name == per_head_key else [stored]
        for collector in batch:
            registry.register(collector)
305+
232306
def register_all(self, registry: CollectorRegistry, workers: int = 1):
233307
"""Register all metrics to the specified registry"""
234308
for metric_name in self.METRICS:
@@ -238,6 +312,8 @@ def register_all(self, registry: CollectorRegistry, workers: int = 1):
238312
registry.register(work_process_metrics.request_params_max_tokens)
239313
registry.register(work_process_metrics.prompt_tokens_total)
240314
registry.register(work_process_metrics.request_prompt_tokens)
315+
if hasattr(main_process_metrics, "spec_decode_draft_acceptance_rate"):
316+
self.register_speculative_metrics(registry)
241317

242318
@classmethod
243319
def get_excluded_metrics(cls) -> Set[str]:

fastdeploy/output/token_processor.py

Lines changed: 85 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -83,6 +83,16 @@ def __init__(self, cfg, cached_generated_tokens, engine_worker_queue,
8383
self.number_of_output_tokens = 0
8484
self.total_step = 0
8585
self.speculative_stats_step = 0
86+
self.num_draft_tokens = 0
87+
self.num_accepted_tokens = 0
88+
self.num_emitted_tokens = 0
89+
self.max_num_emitted_tokens = 0
90+
self.num_rest_requests_per_head = [
91+
0,
92+
] * MAX_DRAFT_TOKENS
93+
self.num_accept_requests_per_head = [
94+
0,
95+
] * MAX_DRAFT_TOKENS
8696
prefill_time_data = np.zeros([100], dtype=np.float32)
8797
self.prefill_time_signal = IPCSignal(name="prefill_time_signal",
8898
array=prefill_time_data,
@@ -278,24 +288,27 @@ def _recycle_resources(self,
278288

279289
def _compute_speculative_status(self):
# Logs speculative-decoding acceptance statistics through spec_logger when
# self.speculative_stats_step is a multiple of `interval`.
# NOTE(review): this span is a diff view with indentation stripped; marker
# lines ending in "-" appear to precede removed lines and markers ending in
# "+" appear to precede added lines — confirm against the real file.
280290
# TODO(liuzichang): Supplement more statistics
281-
interval = 10
282-
self.speculative_stats_step += 1
291+
interval = 50
283292
if self.speculative_stats_step % interval == 0:
284293
# NOTE(review): divides by self.number_of_output_tokens, which this method
# resets to 0 further down — presumably it is incremented again before the
# next logging step; verify against the caller.
accept_ratio = 1 - self.total_step * 1.0 / self.number_of_output_tokens
285294
spec_logger.info(
286295
f"Speculate global accept ratio(Accept draft_tokens/Generated tokens): {accept_ratio}"
287296
f" total step: {self.total_step}. total output token num: {self.number_of_output_tokens}"
288297
)
289298

290-
if self.cfg.speculative_config.method in ["mtp"] and \
291-
self.cfg.speculative_config.num_speculative_tokens == 1:
292-
single_head_accep_ratio = accept_ratio / (1 - accept_ratio)
293-
spec_logger.info(
294-
f" Single head accept ratio: {single_head_accep_ratio}")
299+
# For the "mtp" method, one ratio per draft head is computed from the
# accumulated per-head counters and logged as a list.
# NOTE(review): the division below raises ZeroDivisionError while
# num_rest_requests_per_head[head] is still 0 — confirm that cannot happen
# by the time this runs.
if self.cfg.speculative_config.method in ["mtp"]:
300+
single_head_acceptance_rates = []
301+
for head in range(self.cfg.speculative_config.num_speculative_tokens):
302+
single_head_acceptance_rates.append(
303+
self.num_accept_requests_per_head[head]
304+
/ self.num_rest_requests_per_head[head]
305+
)
306+
spec_logger.info(f" Single head accept ratio: {single_head_acceptance_rates}")
295307

296308
# Reset the running token and step totals once more than 1000000 output
# tokens have been counted.
if self.number_of_output_tokens > 1000000:
297309
self.number_of_output_tokens = 0
298310
self.total_step = 0
311+
self.speculative_stats_step += 1
299312

300313
def _process_sampling_with_logprob_batch_output(self):
301314
"""
@@ -422,6 +435,7 @@ def _process_batch_output(self):
422435
if self.cfg.speculative_config.method:
423436
batch = self.output_tokens[1]
424437
accept_num = tokens[2:batch + 2]
438+
self._record_speculative_decoding_mertics(accept_num)
425439
else:
426440
batch = self.output_tokens[1, 0]
427441
tokens = tokens[2:batch + 2]
@@ -558,6 +572,70 @@ def _record_completion_metrics(self, task, current_time):
558572
main_process_metrics.request_generation_tokens.observe(
559573
self.tokens_counter[task.request_id])
560574

575+
def _record_speculative_decoding_mertics(self, accept_num):
    """Record speculative-decoding metrics for one batch of outputs.

    Updates the running totals kept on this processor and pushes them to
    ``main_process_metrics``. The speculative collectors are created lazily
    on the first call.

    Every ratio gauge is only updated once its cumulative denominator is
    non-zero. Without that guard a batch whose entries are all 0, or a draft
    head that no request has reached yet, raises ``ZeroDivisionError``.

    Args:
        accept_num: Per-request values for this step. Entries equal to 0 are
            ignored; each remaining entry ``x`` is counted as ``x`` emitted
            tokens, of which ``x - 1`` are accepted tokens.
    """
    spec_cfg = self.cfg.speculative_config

    if not hasattr(main_process_metrics, "spec_decode_draft_acceptance_rate"):
        main_process_metrics._init_speculative_metrics(
            spec_cfg.method,
            spec_cfg.num_speculative_tokens,
        )

    real_accept_num = [x for x in accept_num if x != 0]
    num_requests = len(real_accept_num)

    num_accepted_tokens = sum(x - 1 for x in real_accept_num)
    self.num_accepted_tokens += num_accepted_tokens
    num_emitted_tokens = sum(real_accept_num)
    self.num_emitted_tokens += num_emitted_tokens

    main_process_metrics.spec_decode_num_accepted_tokens_total.inc(
        num_accepted_tokens
    )
    main_process_metrics.spec_decode_num_emitted_tokens_total.inc(
        num_emitted_tokens
    )

    if spec_cfg.method in ["ngram"]:
        # No token emitted so far means the rate is undefined; leave the
        # gauge untouched instead of dividing by zero.
        if self.num_emitted_tokens > 0:
            main_process_metrics.spec_decode_draft_acceptance_rate.set(
                self.num_accepted_tokens / self.num_emitted_tokens
            )

    if spec_cfg.method in ["mtp"]:
        num_heads = spec_cfg.num_speculative_tokens

        num_draft_tokens = num_requests * num_heads
        self.num_draft_tokens += num_draft_tokens
        self.max_num_emitted_tokens += num_requests * (num_heads + 1)

        if self.num_draft_tokens > 0:
            main_process_metrics.spec_decode_draft_acceptance_rate.set(
                self.num_accepted_tokens / self.num_draft_tokens
            )
        if self.max_num_emitted_tokens > 0:
            main_process_metrics.spec_decode_efficiency.set(
                self.num_emitted_tokens / self.max_num_emitted_tokens
            )
        main_process_metrics.spec_decode_num_draft_tokens_total.inc(
            num_draft_tokens
        )

        num_rest_requests = num_requests
        for head in range(num_heads):
            num_accept_requests = sum(1 for x in real_accept_num if x >= head + 2)
            # Accumulate the number of requests for each head
            self.num_accept_requests_per_head[head] += num_accept_requests
            self.num_rest_requests_per_head[head] += num_rest_requests
            # Only requests that passed this head are candidates for the next
            num_rest_requests = num_accept_requests
            # The denominator stays 0 until some request reaches this head,
            # so skip the gauge update until then.
            rest_total = self.num_rest_requests_per_head[head]
            if rest_total > 0:
                main_process_metrics.spec_decode_draft_single_head_acceptance_rate[
                    head
                ].set(self.num_accept_requests_per_head[head] / rest_total)
638+
561639

562640
class WarmUpTokenProcessor(TokenProcessor):
563641
"""

0 commit comments

Comments
 (0)