@@ -138,6 +138,10 @@ def log_engine_initialized(self):
 
 
 class PrometheusStatLogger(StatLoggerBase):
+    _gauge_cls = prometheus_client.Gauge
+    _counter_cls = prometheus_client.Counter
+    _histogram_cls = prometheus_client.Histogram
+    _spec_decoding_cls = SpecDecodingProm
 
     def __init__(self, vllm_config: VllmConfig, engine_index: int = 0):
         self._unregister_vllm_metrics()
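Taken together, the four class attributes added above act as factory hooks: every metric constructed in this file now goes through self._gauge_cls, self._counter_cls, self._histogram_cls, or self._spec_decoding_cls instead of referencing prometheus_client directly, so a subclass can substitute another metric implementation without copying __init__. A minimal sketch of such an override follows; it is not part of this diff, and the NullMetric and NullStatLogger names are invented for illustration.

# Hypothetical sketch (not part of the diff): a subclass can override the new
# class-attribute hooks, for example to silence metrics in tests.
class NullMetric:
    """Accepts the same keyword arguments as the prometheus_client metrics
    used below (name, documentation, labelnames, buckets) and ignores all
    updates."""

    def __init__(self, **kwargs):
        pass

    def labels(self, *values, **kv):
        # prometheus_client returns a labelled child; returning self is enough
        # for a no-op stand-in.
        return self

    def set(self, value):  # Gauge-style update
        pass

    def inc(self, value=1):  # Counter-style update
        pass

    def observe(self, value):  # Histogram-style update
        pass


class NullStatLogger(PrometheusStatLogger):
    # Every metric created in PrometheusStatLogger.__init__ and
    # log_metrics_info now resolves to NullMetric.
    _gauge_cls = NullMetric
    _counter_cls = NullMetric
    _histogram_cls = NullMetric

The same pattern would allow _spec_decoding_cls to be replaced as well, provided the substitute accepts the (speculative_config, labelnames, labelvalues) positional arguments shown in the next hunk.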
@@ -156,37 +160,37 @@ def __init__(self, vllm_config: VllmConfig, engine_index: int = 0):
 
         max_model_len = vllm_config.model_config.max_model_len
 
-        self.spec_decoding_prom = SpecDecodingProm(
+        self.spec_decoding_prom = self._spec_decoding_cls(
             vllm_config.speculative_config, labelnames, labelvalues)
 
         #
         # Scheduler state
         #
-        self.gauge_scheduler_running = prometheus_client.Gauge(
+        self.gauge_scheduler_running = self._gauge_cls(
             name="vllm:num_requests_running",
             documentation="Number of requests in model execution batches.",
             labelnames=labelnames).labels(*labelvalues)
 
-        self.gauge_scheduler_waiting = prometheus_client.Gauge(
+        self.gauge_scheduler_waiting = self._gauge_cls(
             name="vllm:num_requests_waiting",
             documentation="Number of requests waiting to be processed.",
             labelnames=labelnames).labels(*labelvalues)
 
         #
         # GPU cache
         #
-        self.gauge_gpu_cache_usage = prometheus_client.Gauge(
+        self.gauge_gpu_cache_usage = self._gauge_cls(
             name="vllm:gpu_cache_usage_perc",
             documentation="GPU KV-cache usage. 1 means 100 percent usage.",
             labelnames=labelnames).labels(*labelvalues)
 
-        self.counter_gpu_prefix_cache_queries = prometheus_client.Counter(
+        self.counter_gpu_prefix_cache_queries = self._counter_cls(
             name="vllm:gpu_prefix_cache_queries",
             documentation=
             "GPU prefix cache queries, in terms of number of queried tokens.",
             labelnames=labelnames).labels(*labelvalues)
 
-        self.counter_gpu_prefix_cache_hits = prometheus_client.Counter(
+        self.counter_gpu_prefix_cache_hits = self._counter_cls(
             name="vllm:gpu_prefix_cache_hits",
             documentation=
             "GPU prefix cache hits, in terms of number of cached tokens.",
@@ -195,24 +199,24 @@ def __init__(self, vllm_config: VllmConfig, engine_index: int = 0):
         #
         # Counters
         #
-        self.counter_num_preempted_reqs = prometheus_client.Counter(
+        self.counter_num_preempted_reqs = self._counter_cls(
             name="vllm:num_preemptions_total",
             documentation="Cumulative number of preemption from the engine.",
             labelnames=labelnames).labels(*labelvalues)
 
-        self.counter_prompt_tokens = prometheus_client.Counter(
+        self.counter_prompt_tokens = self._counter_cls(
             name="vllm:prompt_tokens_total",
             documentation="Number of prefill tokens processed.",
             labelnames=labelnames).labels(*labelvalues)
 
-        self.counter_generation_tokens = prometheus_client.Counter(
+        self.counter_generation_tokens = self._counter_cls(
             name="vllm:generation_tokens_total",
             documentation="Number of generation tokens processed.",
             labelnames=labelnames).labels(*labelvalues)
 
         self.counter_request_success: dict[FinishReason,
                                            prometheus_client.Counter] = {}
-        counter_request_success_base = prometheus_client.Counter(
+        counter_request_success_base = self._counter_cls(
             name="vllm:request_success_total",
             documentation="Count of successfully processed requests.",
             labelnames=labelnames + ["finished_reason"])
@@ -225,21 +229,21 @@ def __init__(self, vllm_config: VllmConfig, engine_index: int = 0):
         # Histograms of counts
         #
         self.histogram_num_prompt_tokens_request = \
-            prometheus_client.Histogram(
+            self._histogram_cls(
                 name="vllm:request_prompt_tokens",
                 documentation="Number of prefill tokens processed.",
                 buckets=build_1_2_5_buckets(max_model_len),
                 labelnames=labelnames).labels(*labelvalues)
 
         self.histogram_num_generation_tokens_request = \
-            prometheus_client.Histogram(
+            self._histogram_cls(
                 name="vllm:request_generation_tokens",
                 documentation="Number of generation tokens processed.",
                 buckets=build_1_2_5_buckets(max_model_len),
                 labelnames=labelnames).labels(*labelvalues)
 
         self.histogram_iteration_tokens = \
-            prometheus_client.Histogram(
+            self._histogram_cls(
                 name="vllm:iteration_tokens_total",
                 documentation="Histogram of number of tokens per engine_step.",
                 buckets=[
@@ -249,22 +253,22 @@ def __init__(self, vllm_config: VllmConfig, engine_index: int = 0):
                 labelnames=labelnames).labels(*labelvalues)
 
         self.histogram_max_num_generation_tokens_request = \
-            prometheus_client.Histogram(
+            self._histogram_cls(
                 name="vllm:request_max_num_generation_tokens",
                 documentation=
                 "Histogram of maximum number of requested generation tokens.",
                 buckets=build_1_2_5_buckets(max_model_len),
                 labelnames=labelnames).labels(*labelvalues)
 
         self.histogram_n_request = \
-            prometheus_client.Histogram(
+            self._histogram_cls(
                 name="vllm:request_params_n",
                 documentation="Histogram of the n request parameter.",
                 buckets=[1, 2, 5, 10, 20],
                 labelnames=labelnames).labels(*labelvalues)
 
         self.histogram_max_tokens_request = \
-            prometheus_client.Histogram(
+            self._histogram_cls(
                 name="vllm:request_params_max_tokens",
                 documentation="Histogram of the max_tokens request parameter.",
                 buckets=build_1_2_5_buckets(max_model_len),
@@ -274,7 +278,7 @@ def __init__(self, vllm_config: VllmConfig, engine_index: int = 0):
         # Histogram of timing intervals
         #
         self.histogram_time_to_first_token = \
-            prometheus_client.Histogram(
+            self._histogram_cls(
                 name="vllm:time_to_first_token_seconds",
                 documentation="Histogram of time to first token in seconds.",
                 buckets=[
@@ -285,7 +289,7 @@ def __init__(self, vllm_config: VllmConfig, engine_index: int = 0):
                 labelnames=labelnames).labels(*labelvalues)
 
         self.histogram_time_per_output_token = \
-            prometheus_client.Histogram(
+            self._histogram_cls(
                 name="vllm:time_per_output_token_seconds",
                 documentation="Histogram of time per output token in seconds.",
                 buckets=[
@@ -299,34 +303,34 @@ def __init__(self, vllm_config: VllmConfig, engine_index: int = 0):
             40.0, 50.0, 60.0, 120.0, 240.0, 480.0, 960.0, 1920.0, 7680.0
         ]
         self.histogram_e2e_time_request = \
-            prometheus_client.Histogram(
+            self._histogram_cls(
                 name="vllm:e2e_request_latency_seconds",
                 documentation="Histogram of e2e request latency in seconds.",
                 buckets=request_latency_buckets,
                 labelnames=labelnames).labels(*labelvalues)
         self.histogram_queue_time_request = \
-            prometheus_client.Histogram(
+            self._histogram_cls(
                 name="vllm:request_queue_time_seconds",
                 documentation=
                 "Histogram of time spent in WAITING phase for request.",
                 buckets=request_latency_buckets,
                 labelnames=labelnames).labels(*labelvalues)
         self.histogram_inference_time_request = \
-            prometheus_client.Histogram(
+            self._histogram_cls(
                 name="vllm:request_inference_time_seconds",
                 documentation=
                 "Histogram of time spent in RUNNING phase for request.",
                 buckets=request_latency_buckets,
                 labelnames=labelnames).labels(*labelvalues)
         self.histogram_prefill_time_request = \
-            prometheus_client.Histogram(
+            self._histogram_cls(
                 name="vllm:request_prefill_time_seconds",
                 documentation=
                 "Histogram of time spent in PREFILL phase for request.",
                 buckets=request_latency_buckets,
                 labelnames=labelnames).labels(*labelvalues)
         self.histogram_decode_time_request = \
-            prometheus_client.Histogram(
+            self._histogram_cls(
                 name="vllm:request_decode_time_seconds",
                 documentation=
                 "Histogram of time spent in DECODE phase for request.",
@@ -343,7 +347,7 @@ def __init__(self, vllm_config: VllmConfig, engine_index: int = 0):
             self.labelname_running_lora_adapters = "running_lora_adapters"
             self.max_lora = vllm_config.lora_config.max_loras
             self.gauge_lora_info = \
-                prometheus_client.Gauge(
+                self._gauge_cls(
                     name="vllm:lora_requests_info",
                     documentation="Running stats on lora requests.",
                     labelnames=[
@@ -365,7 +369,7 @@ def log_metrics_info(self, type: str, config_obj: SupportsMetricsInfo):
         # Info type metrics are syntactic sugar for a gauge permanently set to 1
         # Since prometheus multiprocessing mode does not support Info, emulate
         # info here with a gauge.
-        info_gauge = prometheus_client.Gauge(
+        info_gauge = self._gauge_cls(
             name=name,
             documentation=documentation,
             labelnames=metrics_info.keys()).labels(**metrics_info)
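For reference, a hedged sketch of the contract these hooks appear to require, judging only from the call sites in the hunks above: constructors are invoked with keyword arguments (name, documentation, labelnames, plus buckets for histograms), and the returned object must expose .labels(), which the request metrics call with positional label values and log_metrics_info calls with keyword pairs. The Protocol names below are invented for illustration and are not part of the diff.

# Hypothetical sketch (not part of the diff): the minimal interface a
# replacement for _gauge_cls / _counter_cls / _histogram_cls must satisfy,
# inferred from how PrometheusStatLogger and log_metrics_info call them.
from typing import Any, Iterable, Protocol, Sequence


class MetricLike(Protocol):
    # .labels() is called both as .labels(*labelvalues) and, for the info
    # gauge, as .labels(**metrics_info).
    def labels(self, *values: str, **kv: str) -> Any:
        ...


class GaugeFactory(Protocol):
    # Matches the keyword-only way gauges and counters are constructed above.
    def __call__(self, *, name: str, documentation: str,
                 labelnames: Iterable[str] = ()) -> MetricLike:
        ...


class HistogramFactory(Protocol):
    # Histograms additionally receive explicit buckets.
    def __call__(self, *, name: str, documentation: str,
                 labelnames: Iterable[str] = (),
                 buckets: Sequence[float] = ()) -> MetricLike:
        ...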