Commit b1023a7

time completions use case (#397)
* time use case
* name
* update fake
1 parent 0b97023 commit b1023a7

File tree

3 files changed: +27 -5 lines changed


model-engine/model_engine_server/api/llms_v1.py

Lines changed: 10 additions & 5 deletions
@@ -44,6 +44,7 @@
     logger_name,
     make_logger,
 )
+from model_engine_server.core.utils.timer import timer
 from model_engine_server.domain.exceptions import (
     DockerImageNotFoundException,
     EndpointDeleteFailedException,
@@ -313,16 +314,18 @@ async def create_completion_sync_task(
         llm_model_endpoint_service=external_interfaces.llm_model_endpoint_service,
         tokenizer_repository=external_interfaces.tokenizer_repository,
     )
-    response = await use_case.execute(
-        user=auth, model_endpoint_name=model_endpoint_name, request=request
-    )
+    with timer() as use_case_timer:
+        response = await use_case.execute(
+            user=auth, model_endpoint_name=model_endpoint_name, request=request
+        )
     background_tasks.add_task(
         external_interfaces.monitoring_metrics_gateway.emit_token_count_metrics,
         TokenUsage(
             num_prompt_tokens=response.output.num_prompt_tokens if response.output else None,
             num_completion_tokens=response.output.num_completion_tokens
             if response.output
             else None,
+            total_duration=use_case_timer.duration,
         ),
         metric_metadata,
     )
@@ -374,15 +377,17 @@ async def create_completion_stream_task(
 
     async def event_generator():
         try:
-            async for message in response:
-                yield {"data": message.json()}
+            with timer() as use_case_timer:
+                async for message in response:
+                    yield {"data": message.json()}
             background_tasks.add_task(
                 external_interfaces.monitoring_metrics_gateway.emit_token_count_metrics,
                 TokenUsage(
                     num_prompt_tokens=message.output.num_prompt_tokens if message.output else None,
                     num_completion_tokens=message.output.num_completion_tokens
                     if message.output
                     else None,
+                    total_duration=use_case_timer.duration,
                 ),
                 metric_metadata,
             )
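
This change relies on timer() exposing the elapsed time as a duration attribute once the with block exits. The actual implementation lives in model_engine_server.core.utils.timer and is not part of this commit; a minimal sketch of a context manager with that interface, assuming a monotonic wall-clock measurement, could look like this:

import time


class timer:
    """Sketch of a timing context manager exposing `.duration` in seconds.

    Illustrative only; the real model_engine_server.core.utils.timer
    implementation may differ.
    """

    def __enter__(self):
        self._start = time.monotonic()
        self.duration = 0.0
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.duration = time.monotonic() - self._start
        return False  # never suppress exceptions raised inside the timed block

Note that in the streaming path the timer wraps the whole async for loop inside event_generator, so total_duration covers the full time to stream every chunk back to the client, not just the time to the first token.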

model-engine/model_engine_server/common/dtos/llms.py

Lines changed: 14 additions & 0 deletions
@@ -280,13 +280,27 @@ class CompletionStreamV1Response(BaseModel):
 
 
 class TokenUsage(BaseModel):
+    """
+    Token usage for a prompt completion task.
+    """
+
     num_prompt_tokens: Optional[int] = 0
     num_completion_tokens: Optional[int] = 0
+    total_duration: Optional[float] = None
+    """Includes time spent waiting for the model to be ready."""
 
     @property
     def num_total_tokens(self) -> int:
         return (self.num_prompt_tokens or 0) + (self.num_completion_tokens or 0)
 
+    @property
+    def total_tokens_per_second(self) -> float:
+        return (
+            self.num_total_tokens / self.total_duration
+            if self.total_duration and self.total_duration > 0
+            else 0.0
+        )
+
 
 class CreateFineTuneRequest(BaseModel):
     model: str
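
With total_duration populated by the timer, the new total_tokens_per_second property yields a per-request throughput figure. A hypothetical usage example (the numbers are made up purely to illustrate the arithmetic):

from model_engine_server.common.dtos.llms import TokenUsage

# Made-up values to show how the new property combines counts and duration.
usage = TokenUsage(num_prompt_tokens=50, num_completion_tokens=150, total_duration=2.0)

assert usage.num_total_tokens == 200
assert usage.total_tokens_per_second == 100.0  # (50 + 150) tokens / 2.0 seconds

# Without a recorded duration the property falls back to 0.0 rather than dividing by zero.
assert TokenUsage(num_prompt_tokens=50).total_tokens_per_second == 0.0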

model-engine/model_engine_server/infra/gateways/fake_monitoring_metrics_gateway.py

Lines changed: 3 additions & 0 deletions
@@ -21,6 +21,7 @@ def __init__(self):
         self.database_cache_miss = 0
         self.route_call = defaultdict(int)
         self.token_count = 0
+        self.total_tokens_per_second = 0
 
     def reset(self):
         self.attempted_build = 0
@@ -35,6 +36,7 @@ def reset(self):
         self.database_cache_miss = 0
         self.route_call = defaultdict(int)
         self.token_count = 0
+        self.total_tokens_per_second = 0
 
     def emit_attempted_build_metric(self):
         self.attempted_build += 1
@@ -71,3 +73,4 @@ def emit_route_call_metric(self, route: str, _metadata: MetricMetadata):
 
     def emit_token_count_metrics(self, token_usage: TokenUsage, _metadata: MetricMetadata):
         self.token_count += token_usage.num_total_tokens
+        self.total_tokens_per_second = token_usage.total_tokens_per_second
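
The fake gateway simply stores the most recently reported throughput, which lets unit tests assert on it. A hypothetical test sketch, assuming the class defined in this module is named FakeMonitoringMetricsGateway (the class name is not shown in this diff):

from model_engine_server.common.dtos.llms import TokenUsage
from model_engine_server.infra.gateways.fake_monitoring_metrics_gateway import (
    FakeMonitoringMetricsGateway,  # assumed class name; not shown in this diff
)


def test_emit_token_count_metrics_records_throughput():
    gateway = FakeMonitoringMetricsGateway()
    usage = TokenUsage(num_prompt_tokens=10, num_completion_tokens=90, total_duration=1.0)

    gateway.emit_token_count_metrics(usage, None)  # the fake ignores the metadata argument

    assert gateway.token_count == 100
    assert gateway.total_tokens_per_second == 100.0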

0 commit comments
