
Commit 0efe79e

Commit message: add
Signed-off-by: Amog Kamsetty <amogkamsetty@gmail.com>
1 parent 572e8fb commit 0efe79e

File tree: 2 files changed, +17 −0 lines

vllm/entrypoints/openai/api_server.py

Lines changed: 1 addition & 0 deletions
@@ -1197,6 +1197,7 @@ async def init_app_state(
         state.openai_serving_models,
         request_logger=request_logger,
         return_tokens_as_token_ids=args.return_tokens_as_token_ids,
+        enable_prompt_tokens_details=args.enable_prompt_tokens_details,
     ) if model_config.runner_type == "generate" else None
     state.openai_serving_pooling = OpenAIServingPooling(
         engine_client,
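Not part of the diff, but for orientation: args.enable_prompt_tokens_details comes from the server's parsed CLI arguments. A minimal, hypothetical sketch of how such a flag would be populated before init_app_state consumes it (the flag name and argparse wiring below are assumptions for illustration, not taken from this commit):

import argparse

# Hypothetical, simplified stand-in for the vLLM frontend's argument parser;
# it only shows how args.enable_prompt_tokens_details gets its value.
parser = argparse.ArgumentParser()
parser.add_argument("--enable-prompt-tokens-details",
                    action="store_true",
                    default=False,
                    help="Attach cached-token counts to usage.prompt_tokens_details.")
args = parser.parse_args(["--enable-prompt-tokens-details"])
assert args.enable_prompt_tokens_details is True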

vllm/entrypoints/openai/serving_completion.py

Lines changed: 16 additions & 0 deletions
@@ -23,6 +23,7 @@
     CompletionResponseStreamChoice,
     CompletionStreamResponse,
     ErrorResponse,
+    PromptTokenUsageInfo,
     RequestResponseMetadata,
     UsageInfo)
 # yapf: enable
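The commit only imports PromptTokenUsageInfo; its definition lives with the other protocol models imported here. Based purely on how it is constructed later in this file, a rough stand-in would look like the following (the pydantic base class and the defaults are assumptions, not taken from this diff):

from typing import Optional

from pydantic import BaseModel


class PromptTokenUsageInfo(BaseModel):
    # Number of prompt tokens served from the prefix cache.
    cached_tokens: Optional[int] = None


class UsageInfo(BaseModel):
    prompt_tokens: int = 0
    completion_tokens: Optional[int] = 0
    total_tokens: int = 0
    # Only populated when the server opts in and cached tokens were reported.
    prompt_tokens_details: Optional[PromptTokenUsageInfo] = None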
@@ -52,12 +53,14 @@ def __init__(
         *,
         request_logger: Optional[RequestLogger],
         return_tokens_as_token_ids: bool = False,
+        enable_prompt_tokens_details: bool = False,
     ):
         super().__init__(engine_client=engine_client,
                          model_config=model_config,
                          models=models,
                          request_logger=request_logger,
                          return_tokens_as_token_ids=return_tokens_as_token_ids)
+        self.enable_prompt_tokens_details = enable_prompt_tokens_details
         self.default_sampling_params = (
             self.model_config.get_diff_sampling_param())
         if self.default_sampling_params:
@@ -297,6 +300,7 @@ async def completion_stream_generator(
         previous_num_tokens = [0] * num_choices * num_prompts
         has_echoed = [False] * num_choices * num_prompts
         num_prompt_tokens = [0] * num_prompts
+        num_cached_tokens = [0] * num_prompts

         stream_options = request.stream_options
         if stream_options:
@@ -311,11 +315,15 @@
                 prompt_token_ids = res.prompt_token_ids
                 prompt_logprobs = res.prompt_logprobs
                 prompt_text = res.prompt
+                cached_tokens = res.num_cached_tokens

                 # Prompt details are excluded from later streamed outputs
                 if prompt_token_ids is not None:
                     num_prompt_tokens[prompt_idx] = len(prompt_token_ids)

+                if cached_tokens is not None:
+                    num_cached_tokens[prompt_idx] = cached_tokens
+
                 delta_token_ids: GenericSequence[int]
                 out_logprobs: Optional[GenericSequence[Optional[dict[
                     int, Logprob]]]]
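Taken out of the surrounding stream generator, the bookkeeping added in this hunk reduces to the following self-contained sketch (RequestOutputStub is a made-up stand-in for the few RequestOutput fields the hunk touches; the numbers are illustrative):

from dataclasses import dataclass
from typing import List, Optional


@dataclass
class RequestOutputStub:
    # Stand-in for the RequestOutput fields used above.
    prompt_token_ids: Optional[List[int]]
    num_cached_tokens: Optional[int]


num_prompts = 2
num_prompt_tokens = [0] * num_prompts
num_cached_tokens = [0] * num_prompts

results = [
    (0, RequestOutputStub(prompt_token_ids=[1, 2, 3, 4], num_cached_tokens=3)),
    (1, RequestOutputStub(prompt_token_ids=[5, 6], num_cached_tokens=None)),
]

for prompt_idx, res in results:
    # Prompt details only arrive on the first output for each prompt.
    if res.prompt_token_ids is not None:
        num_prompt_tokens[prompt_idx] = len(res.prompt_token_ids)
    if res.num_cached_tokens is not None:
        num_cached_tokens[prompt_idx] = res.num_cached_tokens

assert num_prompt_tokens == [4, 2]
assert num_cached_tokens == [3, 0]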
@@ -402,10 +410,15 @@

             total_prompt_tokens = sum(num_prompt_tokens)
             total_completion_tokens = sum(previous_num_tokens)
+            total_cached_tokens = sum(num_cached_tokens)
             final_usage_info = UsageInfo(
                 prompt_tokens=total_prompt_tokens,
                 completion_tokens=total_completion_tokens,
                 total_tokens=total_prompt_tokens + total_completion_tokens)
+            if self.enable_prompt_tokens_details and total_cached_tokens:
+                final_usage_info.prompt_tokens_details = PromptTokenUsageInfo(
+                    cached_tokens=total_cached_tokens
+                )

             if include_usage:
                 final_usage_chunk = CompletionStreamResponse(
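With the flag enabled and a non-zero cached count, the usage block attached to the final stream chunk would serialize to something like this (the values are illustrative, not produced by the diff):

# Illustrative shape of the final streamed usage payload when
# enable_prompt_tokens_details is on and 256 prompt tokens hit the cache.
final_usage = {
    "prompt_tokens": 512,
    "completion_tokens": 128,
    "total_tokens": 640,
    "prompt_tokens_details": {"cached_tokens": 256},
}
assert final_usage["total_tokens"] == (
    final_usage["prompt_tokens"] + final_usage["completion_tokens"])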
@@ -510,6 +523,9 @@ def request_output_to_completion_response(
             completion_tokens=num_generated_tokens,
             total_tokens=num_prompt_tokens + num_generated_tokens,
         )
+        if self.enable_prompt_tokens_details and final_res_batch[0].num_cached_tokens:
+            usage.prompt_tokens_details = PromptTokenUsageInfo(
+                cached_tokens=final_res_batch[0].num_cached_tokens)

         request_metadata.final_usage_info = usage
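End to end, a client can observe the new field once the server is started with the option enabled. A hedged sketch using the openai Python client against a local vLLM endpoint (the model name and base URL are placeholders, and prompt_tokens_details is only present on client versions that expose it on CompletionUsage):

from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

completion = client.completions.create(
    model="my-model",  # placeholder model name
    prompt="San Francisco is a",
    max_tokens=16,
)

usage = completion.usage
print(usage.prompt_tokens, usage.completion_tokens, usage.total_tokens)
# prompt_tokens_details is attached only when the server flag is on and some
# prompt tokens were served from the prefix cache.
details = getattr(usage, "prompt_tokens_details", None)
if details is not None:
    print("cached prompt tokens:", details.cached_tokens)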
