
Commit 3422535

[Feat][CLI] enforce-include-usage (vllm-project#19695)
Signed-off-by: Max Wittig <max.wittig@siemens.com>
Signed-off-by: Will Eaton <weaton@redhat.com>
1 parent c38dac5 commit 3422535

5 files changed: +34 −9 lines changed

vllm/entrypoints/openai/api_server.py

Lines changed: 2 additions & 0 deletions
@@ -1190,13 +1190,15 @@ async def init_app_state(
         tool_parser=args.tool_call_parser,
         reasoning_parser=args.reasoning_parser,
         enable_prompt_tokens_details=args.enable_prompt_tokens_details,
+        enable_force_include_usage=args.enable_force_include_usage,
     ) if model_config.runner_type == "generate" else None
     state.openai_serving_completion = OpenAIServingCompletion(
         engine_client,
         model_config,
         state.openai_serving_models,
         request_logger=request_logger,
         return_tokens_as_token_ids=args.return_tokens_as_token_ids,
+        enable_force_include_usage=args.enable_force_include_usage,
     ) if model_config.runner_type == "generate" else None
     state.openai_serving_pooling = OpenAIServingPooling(
         engine_client,

vllm/entrypoints/openai/cli_args.py

Lines changed: 5 additions & 0 deletions
@@ -272,6 +272,11 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
         action='store_true',
         default=False,
         help="If set to True, enable prompt_tokens_details in usage.")
+    parser.add_argument(
+        "--enable-force-include-usage",
+        action='store_true',
+        default=False,
+        help="If set to True, including usage on every request.")
     parser.add_argument(
         "--enable-server-load-tracking",
         action='store_true',
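
For reference, a minimal, self-contained sketch of how the new flag parses. This uses plain argparse (vLLM's FlexibleArgumentParser subclasses it and is not reproduced here); the add_argument call mirrors the one introduced above.

import argparse

# Stand-in parser: mirrors the flag definition added in this commit.
parser = argparse.ArgumentParser()
parser.add_argument(
    "--enable-force-include-usage",
    action="store_true",
    default=False,
    help="If set to True, including usage on every request.")

# store_true flags default to False and flip to True when passed on the CLI.
assert parser.parse_args([]).enable_force_include_usage is False
assert parser.parse_args(
    ["--enable-force-include-usage"]).enable_force_include_usage is True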

vllm/entrypoints/openai/serving_chat.py

Lines changed: 15 additions & 4 deletions
@@ -64,12 +64,14 @@ def __init__(
         enable_auto_tools: bool = False,
         tool_parser: Optional[str] = None,
         enable_prompt_tokens_details: bool = False,
+        enable_force_include_usage: bool = False,
     ) -> None:
         super().__init__(engine_client=engine_client,
                          model_config=model_config,
                          models=models,
                          request_logger=request_logger,
-                         return_tokens_as_token_ids=return_tokens_as_token_ids)
+                         return_tokens_as_token_ids=return_tokens_as_token_ids,
+                         enable_force_include_usage=enable_force_include_usage)
 
         self.response_role = response_role
         self.chat_template = chat_template
@@ -110,6 +112,7 @@ def __init__(
                 "been registered") from e
 
         self.enable_prompt_tokens_details = enable_prompt_tokens_details
+        self.enable_force_include_usage = enable_force_include_usage
         self.default_sampling_params = (
             self.model_config.get_diff_sampling_param())
         if self.default_sampling_params:
@@ -261,8 +264,14 @@ async def create_chat_completion(
         # Streaming response
         if request.stream:
             return self.chat_completion_stream_generator(
-                request, result_generator, request_id, model_name,
-                conversation, tokenizer, request_metadata)
+                request,
+                result_generator,
+                request_id,
+                model_name,
+                conversation,
+                tokenizer,
+                request_metadata,
+                enable_force_include_usage=self.enable_force_include_usage)
 
         try:
             return await self.chat_completion_full_generator(
@@ -405,6 +414,7 @@ async def chat_completion_stream_generator(
         conversation: list[ConversationMessage],
         tokenizer: AnyTokenizer,
         request_metadata: RequestResponseMetadata,
+        enable_force_include_usage: bool,
     ) -> AsyncGenerator[str, None]:
         created_time = int(time.time())
         chunk_object_type: Final = "chat.completion.chunk"
@@ -471,7 +481,8 @@
 
         stream_options = request.stream_options
         if stream_options:
-            include_usage = stream_options.include_usage
+            include_usage = stream_options.include_usage \
+                or enable_force_include_usage
             include_continuous_usage = include_usage and \
                 stream_options.continuous_usage_stats
         else:
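
A minimal sketch of the decision made in the last hunk (standalone code with a stand-in StreamOptions type, not the real vLLM generator): the server-side flag is ORed with whatever the client requested, so the usage chunk is emitted even when the client's stream_options sets include_usage to false.

from typing import NamedTuple

class StreamOptions(NamedTuple):
    # Stand-in for the OpenAI-style stream_options payload.
    include_usage: bool = False
    continuous_usage_stats: bool = False

def resolve_usage(stream_options: StreamOptions,
                  enable_force_include_usage: bool) -> tuple[bool, bool]:
    # Mirrors only the branch shown above, where stream_options was provided.
    include_usage = stream_options.include_usage or enable_force_include_usage
    include_continuous_usage = (include_usage
                                and stream_options.continuous_usage_stats)
    return include_usage, include_continuous_usage

# Client opted out, server forces usage: the usage chunk is still sent.
assert resolve_usage(StreamOptions(include_usage=False), True) == (True, False)
# Server flag off: the client's choice stands.
assert resolve_usage(StreamOptions(include_usage=False), False) == (False, False)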

vllm/entrypoints/openai/serving_completion.py

Lines changed: 8 additions & 3 deletions
@@ -52,12 +52,14 @@ def __init__(
         *,
         request_logger: Optional[RequestLogger],
         return_tokens_as_token_ids: bool = False,
+        enable_force_include_usage: bool = False,
     ):
         super().__init__(engine_client=engine_client,
                          model_config=model_config,
                          models=models,
                          request_logger=request_logger,
-                         return_tokens_as_token_ids=return_tokens_as_token_ids)
+                         return_tokens_as_token_ids=return_tokens_as_token_ids,
+                         enable_force_include_usage=enable_force_include_usage)
         self.default_sampling_params = (
             self.model_config.get_diff_sampling_param())
         if self.default_sampling_params:
@@ -227,7 +229,8 @@ async def create_completion(
                 model_name,
                 num_prompts=num_prompts,
                 tokenizer=tokenizer,
-                request_metadata=request_metadata)
+                request_metadata=request_metadata,
+                enable_force_include_usage=self.enable_force_include_usage)
 
         # Non-streaming response
         final_res_batch: list[Optional[RequestOutput]] = [None] * num_prompts
@@ -289,6 +292,7 @@ async def completion_stream_generator(
         num_prompts: int,
         tokenizer: AnyTokenizer,
         request_metadata: RequestResponseMetadata,
+        enable_force_include_usage: bool,
     ) -> AsyncGenerator[str, None]:
         num_choices = 1 if request.n is None else request.n
         previous_text_lens = [0] * num_choices * num_prompts
@@ -298,7 +302,8 @@
 
         stream_options = request.stream_options
         if stream_options:
-            include_usage = stream_options.include_usage
+            include_usage = stream_options.include_usage or \
+                enable_force_include_usage
             include_continuous_usage = include_usage and \
                 stream_options.continuous_usage_stats
         else:
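
To see the effect from a client, a hedged sketch follows. The server address and model name are placeholders, and it assumes a vLLM OpenAI-compatible server was launched with --enable-force-include-usage; none of these values come from this commit.

import json
import requests

# Stream a completion while explicitly opting out of usage reporting; with the
# server-side flag enabled, the hunk above ORs it back in, so a final chunk
# carrying a "usage" object should still arrive.
resp = requests.post(
    "http://localhost:8000/v1/completions",
    json={
        "model": "my-model",            # placeholder model name
        "prompt": "Hello",
        "max_tokens": 8,
        "stream": True,
        "stream_options": {"include_usage": False},
    },
    stream=True,
)
for line in resp.iter_lines():
    if not line or not line.startswith(b"data: ") or line == b"data: [DONE]":
        continue
    chunk = json.loads(line[len(b"data: "):])
    if chunk.get("usage"):
        print("usage:", chunk["usage"])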

vllm/entrypoints/openai/serving_engine.py

Lines changed: 4 additions & 2 deletions
@@ -132,7 +132,7 @@ def is_embeds_prompt(prompt: RequestPrompt) -> TypeIs[EmbedsPrompt]:
 
 class RequestProcessingMixin(BaseModel):
     """
-    Mixin for request processing, 
+    Mixin for request processing,
     handling prompt preparation and engine input.
     """
     request_prompts: Optional[Sequence[RequestPrompt]] = []
@@ -144,7 +144,7 @@ class RequestProcessingMixin(BaseModel):
 
 class ResponseGenerationMixin(BaseModel):
     """
-    Mixin for response generation, 
+    Mixin for response generation,
     managing result generators and final batch results.
     """
     result_generator: Optional[AsyncGenerator[tuple[int, Union[
@@ -208,6 +208,7 @@ def __init__(
         *,
         request_logger: Optional[RequestLogger],
         return_tokens_as_token_ids: bool = False,
+        enable_force_include_usage: bool = False,
     ):
         super().__init__()
 
@@ -219,6 +220,7 @@ def __init__(
 
         self.request_logger = request_logger
         self.return_tokens_as_token_ids = return_tokens_as_token_ids
+        self.enable_force_include_usage = enable_force_include_usage
 
         self._tokenizer_executor = ThreadPoolExecutor(max_workers=1)
 
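The pattern here, sketched below with stand-in class names (not the real vLLM hierarchy): the base serving class accepts and stores the flag, so any OpenAI-compatible handler built on it can read self.enable_force_include_usage.

class ServingBase:
    def __init__(self, *, enable_force_include_usage: bool = False):
        # Shared storage for the forced-usage setting.
        self.enable_force_include_usage = enable_force_include_usage

class ChatHandler(ServingBase):
    def __init__(self, *, enable_force_include_usage: bool = False):
        # Subclasses forward the flag to the base class on construction.
        super().__init__(enable_force_include_usage=enable_force_include_usage)

handler = ChatHandler(enable_force_include_usage=True)
assert handler.enable_force_include_usage is True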