@@ -83,6 +83,16 @@ def __init__(self, cfg, cached_generated_tokens, engine_worker_queue,
83
83
self .number_of_output_tokens = 0
84
84
self .total_step = 0
85
85
self .speculative_stats_step = 0
86
+ self .num_draft_tokens = 0
87
+ self .num_accepted_tokens = 0
88
+ self .num_emitted_tokens = 0
89
+ self .max_num_emitted_tokens = 0
90
+ self .num_rest_requests_per_head = [
91
+ 0 ,
92
+ ] * MAX_DRAFT_TOKENS
93
+ self .num_accept_requests_per_head = [
94
+ 0 ,
95
+ ] * MAX_DRAFT_TOKENS
86
96
prefill_time_data = np .zeros ([100 ], dtype = np .float32 )
87
97
self .prefill_time_signal = IPCSignal (name = "prefill_time_signal" ,
88
98
array = prefill_time_data ,
@@ -278,24 +288,27 @@ def _recycle_resources(self,
278
288
279
289
def _compute_speculative_status (self ):
280
290
# TODO(liuzichang): Supplement more statistics
281
- interval = 10
282
- self .speculative_stats_step += 1
291
+ interval = 50
283
292
if self .speculative_stats_step % interval == 0 :
284
293
accept_ratio = 1 - self .total_step * 1.0 / self .number_of_output_tokens
285
294
spec_logger .info (
286
295
f"Speculate global accept ratio(Accept draft_tokens/Generated tokens): { accept_ratio } "
287
296
f" total step: { self .total_step } . total output token num: { self .number_of_output_tokens } "
288
297
)
289
298
290
- if self .cfg .speculative_config .method in ["mtp" ] and \
291
- self .cfg .speculative_config .num_speculative_tokens == 1 :
292
- single_head_accep_ratio = accept_ratio / (1 - accept_ratio )
293
- spec_logger .info (
294
- f" Single head accept ratio: { single_head_accep_ratio } " )
299
+ if self .cfg .speculative_config .method in ["mtp" ]:
300
+ single_head_acceptance_rates = []
301
+ for head in range (self .cfg .speculative_config .num_speculative_tokens ):
302
+ single_head_acceptance_rates .append (
303
+ self .num_accept_requests_per_head [head ]
304
+ / self .num_rest_requests_per_head [head ]
305
+ )
306
+ spec_logger .info (f" Single head accept ratio: { single_head_acceptance_rates } " )
295
307
296
308
if self .number_of_output_tokens > 1000000 :
297
309
self .number_of_output_tokens = 0
298
310
self .total_step = 0
311
+ self .speculative_stats_step += 1
299
312
300
313
def _process_sampling_with_logprob_batch_output (self ):
301
314
"""
@@ -422,6 +435,7 @@ def _process_batch_output(self):
422
435
if self .cfg .speculative_config .method :
423
436
batch = self .output_tokens [1 ]
424
437
accept_num = tokens [2 :batch + 2 ]
438
+ self ._record_speculative_decoding_mertics (accept_num )
425
439
else :
426
440
batch = self .output_tokens [1 , 0 ]
427
441
tokens = tokens [2 :batch + 2 ]
@@ -558,6 +572,70 @@ def _record_completion_metrics(self, task, current_time):
558
572
main_process_metrics .request_generation_tokens .observe (
559
573
self .tokens_counter [task .request_id ])
560
574
575
def _record_speculative_decoding_mertics(self, accept_num):
    """Record Prometheus metrics for speculative decoding.

    NOTE(review): the "mertics" typo in the name is kept deliberately —
    the call site in _process_batch_output uses this exact name; renaming
    would break callers.

    Args:
        accept_num: per-slot accepted-token counts for the current batch;
            entries equal to 0 mark inactive batch slots and are filtered out.
    """
    # Lazily initialise the speculative-decoding metric family on first use.
    if not hasattr(main_process_metrics, "spec_decode_draft_acceptance_rate"):
        main_process_metrics._init_speculative_metrics(
            self.cfg.speculative_config.method,
            self.cfg.speculative_config.num_speculative_tokens,
        )

    real_accept_num = [x for x in accept_num if x != 0]
    # Each active slot emits its accepted count; one of those tokens is the
    # target-model token, so accepted draft tokens are (x - 1) per slot.
    num_accepted_tokens = sum(x - 1 for x in real_accept_num)
    self.num_accepted_tokens += num_accepted_tokens
    num_emitted_tokens = sum(real_accept_num)
    self.num_emitted_tokens += num_emitted_tokens

    main_process_metrics.spec_decode_num_accepted_tokens_total.inc(num_accepted_tokens)
    main_process_metrics.spec_decode_num_emitted_tokens_total.inc(num_emitted_tokens)

    method = self.cfg.speculative_config.method

    # Guard: with an all-zero accept_num history the cumulative counter is 0;
    # skip the gauge update instead of raising ZeroDivisionError.
    if method in ["ngram"] and self.num_emitted_tokens > 0:
        main_process_metrics.spec_decode_draft_acceptance_rate.set(
            self.num_accepted_tokens / self.num_emitted_tokens
        )

    if method in ["mtp"]:
        num_spec_tokens = self.cfg.speculative_config.num_speculative_tokens
        num_draft_tokens = len(real_accept_num) * num_spec_tokens
        self.num_draft_tokens += num_draft_tokens
        # Per active slot, at most num_spec_tokens drafts + 1 target token.
        self.max_num_emitted_tokens += len(real_accept_num) * (num_spec_tokens + 1)

        # Guards mirror the ngram case: denominators stay 0 until the first
        # batch with at least one active slot has been recorded.
        if self.num_draft_tokens > 0:
            main_process_metrics.spec_decode_draft_acceptance_rate.set(
                self.num_accepted_tokens / self.num_draft_tokens
            )
        if self.max_num_emitted_tokens > 0:
            main_process_metrics.spec_decode_efficiency.set(
                self.num_emitted_tokens / self.max_num_emitted_tokens
            )
        main_process_metrics.spec_decode_num_draft_tokens_total.inc(num_draft_tokens)

        # Per-head acceptance: head h only sees requests that were accepted
        # by all previous heads (x >= head + 2 means at least head+1 draft
        # tokens were accepted in that slot).
        num_rest_requests = len(real_accept_num)
        for head in range(num_spec_tokens):
            num_accept_requests = len([x for x in real_accept_num if x >= head + 2])
            # Accumulate the number of requests for each head
            self.num_accept_requests_per_head[head] += num_accept_requests
            self.num_rest_requests_per_head[head] += num_rest_requests
            # Update the rest requests for each head
            num_rest_requests = num_accept_requests
            # Calculate the acceptance rate for each head; skip while no
            # request has reached this head (denominator still 0).
            rest_total = self.num_rest_requests_per_head[head]
            if rest_total > 0:
                main_process_metrics.spec_decode_draft_single_head_acceptance_rate[
                    head
                ].set(self.num_accept_requests_per_head[head] / rest_total)
561
639
562
640
class WarmUpTokenProcessor (TokenProcessor ):
563
641
"""
0 commit comments