Commit 94ff415

feat: Add --enable-log-outputs flag for logging model generations
Add optional output logging functionality to complement existing input logging. By default, vLLM only logs incoming requests but not model outputs. This feature adds comprehensive output logging controlled by a new CLI flag.

Key features:
- New --enable-log-outputs CLI flag (disabled by default)
- Logs both streaming and non-streaming responses
- Supports individual token deltas in streaming mode
- Handles tool calls and function arguments
- Respects existing --max-log-len truncation settings
- Maintains full backward compatibility

Implementation:
- Added RequestLogger.log_outputs() method for output logging
- Enhanced OpenAIServingChat with output logging in both generators
- Enhanced OpenAIServingResponses with output logging support
- Added comprehensive test coverage for all scenarios

Usage:
python -m vllm.entrypoints.openai.api_server --model MODEL_NAME --enable-log-outputs

Docker:
docker run --gpus all -p 8000:8000 vllm/vllm-openai:latest --model MODEL_NAME --enable-log-outputs

This addresses the common need for debugging and monitoring model outputs while preserving the existing behavior by default.

Signed-off-by: Adrian Garcia <mizadri.gg@gmail.com>
1 parent 332d4cb commit 94ff415
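
For reference (not part of the commit): with --enable-log-outputs set, the new RequestLogger.log_outputs() method added below emits messages in the format "Generated response <request_id><stream_info>: output: ..., output_token_ids: ..., finish_reason: ...". Illustrative log lines, with an invented request ID, text, and token IDs, would look like:

Generated response cmpl-42: output: 'Hello! How can I help you today?', output_token_ids: [9906, 0, 2650, 649, 358, 1520, 499, 3432, 30], finish_reason: stop
Generated response cmpl-42 (streaming delta): output: 'Hello', output_token_ids: [9906], finish_reason: None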

6 files changed, +319 -1 lines changed


tests/test_logger.py

Lines changed: 201 additions & 1 deletion
@@ -10,11 +10,12 @@
 from json.decoder import JSONDecodeError
 from tempfile import NamedTemporaryFile
 from typing import Any
-from unittest.mock import patch
+from unittest.mock import patch, MagicMock
 from uuid import uuid4
 
 import pytest
 
+from vllm.entrypoints.logger import RequestLogger
 from vllm.logger import (_DATE_FORMAT, _FORMAT, _configure_vllm_root_logger,
                          enable_trace_function_call, init_logger)
 from vllm.logging_utils import NewLineFormatter
@@ -253,3 +254,202 @@ class CustomClass:
 
     assert (prepare_object_to_dump(CustomClass(
         1, 'b')) == "CustomClass(a=1, b='b')")
+
+
+def test_request_logger_log_outputs():
+    """Test the new log_outputs functionality."""
+    # Create a mock logger to capture log calls
+    mock_logger = MagicMock()
+
+    with patch('vllm.entrypoints.logger.logger', mock_logger):
+        request_logger = RequestLogger(max_log_len=None)
+
+        # Test basic output logging
+        request_logger.log_outputs(
+            request_id="test-123",
+            outputs="Hello, world!",
+            output_token_ids=[1, 2, 3, 4],
+            finish_reason="stop",
+            is_streaming=False,
+            delta=False
+        )
+
+        mock_logger.info.assert_called_once()
+        call_args = mock_logger.info.call_args[0]
+        assert "Generated response test-123" in call_args[0]
+        assert "Hello, world!" in call_args[1]
+        assert [1, 2, 3, 4] == call_args[2]
+        assert "stop" == call_args[3]
+
+
+def test_request_logger_log_outputs_streaming_delta():
+    """Test log_outputs with streaming delta mode."""
+    mock_logger = MagicMock()
+
+    with patch('vllm.entrypoints.logger.logger', mock_logger):
+        request_logger = RequestLogger(max_log_len=None)
+
+        # Test streaming delta logging
+        request_logger.log_outputs(
+            request_id="test-456",
+            outputs="Hello",
+            output_token_ids=[1],
+            finish_reason=None,
+            is_streaming=True,
+            delta=True
+        )
+
+        mock_logger.info.assert_called_once()
+        call_args = mock_logger.info.call_args[0]
+        assert "Generated response test-456 (streaming delta)" in call_args[0]
+        assert "Hello" == call_args[1]
+        assert [1] == call_args[2]
+        assert call_args[3] is None
+
+
+def test_request_logger_log_outputs_streaming_complete():
+    """Test log_outputs with streaming complete mode."""
+    mock_logger = MagicMock()
+
+    with patch('vllm.entrypoints.logger.logger', mock_logger):
+        request_logger = RequestLogger(max_log_len=None)
+
+        # Test streaming complete logging
+        request_logger.log_outputs(
+            request_id="test-789",
+            outputs="Complete response",
+            output_token_ids=[1, 2, 3],
+            finish_reason="length",
+            is_streaming=True,
+            delta=False
+        )
+
+        mock_logger.info.assert_called_once()
+        call_args = mock_logger.info.call_args[0]
+        assert "Generated response test-789 (streaming complete)" in call_args[0]
+        assert "Complete response" == call_args[1]
+        assert [1, 2, 3] == call_args[2]
+        assert "length" == call_args[3]
+
+
+def test_request_logger_log_outputs_with_truncation():
+    """Test log_outputs respects max_log_len setting."""
+    mock_logger = MagicMock()
+
+    with patch('vllm.entrypoints.logger.logger', mock_logger):
+        # Set max_log_len to 10
+        request_logger = RequestLogger(max_log_len=10)
+
+        # Test output truncation
+        long_output = "This is a very long output that should be truncated"
+        long_token_ids = list(range(20))  # 20 tokens
+
+        request_logger.log_outputs(
+            request_id="test-truncate",
+            outputs=long_output,
+            output_token_ids=long_token_ids,
+            finish_reason="stop",
+            is_streaming=False,
+            delta=False
+        )
+
+        mock_logger.info.assert_called_once()
+        call_args = mock_logger.info.call_args
+
+        # Check that output was truncated to first 10 characters
+        logged_output = call_args[0][1]
+        assert logged_output == "This is a "
+        assert len(logged_output) == 10
+
+        # Check that token IDs were truncated to first 10 tokens
+        logged_token_ids = call_args[0][2]
+        assert logged_token_ids == list(range(10))
+        assert len(logged_token_ids) == 10
+
+
+def test_request_logger_log_outputs_none_values():
+    """Test log_outputs handles None values correctly."""
+    mock_logger = MagicMock()
+
+    with patch('vllm.entrypoints.logger.logger', mock_logger):
+        request_logger = RequestLogger(max_log_len=None)
+
+        # Test with None output_token_ids
+        request_logger.log_outputs(
+            request_id="test-none",
+            outputs="Test output",
+            output_token_ids=None,
+            finish_reason="stop",
+            is_streaming=False,
+            delta=False
+        )
+
+        mock_logger.info.assert_called_once()
+        call_args = mock_logger.info.call_args[0]
+        assert "Generated response test-none" in call_args[0]
+        assert "Test output" == call_args[1]
+        assert call_args[2] is None
+        assert "stop" == call_args[3]
+
+
+def test_request_logger_log_outputs_empty_output():
+    """Test log_outputs handles empty output correctly."""
+    mock_logger = MagicMock()
+
+    with patch('vllm.entrypoints.logger.logger', mock_logger):
+        request_logger = RequestLogger(max_log_len=5)
+
+        # Test with empty output
+        request_logger.log_outputs(
+            request_id="test-empty",
+            outputs="",
+            output_token_ids=[],
+            finish_reason="stop",
+            is_streaming=False,
+            delta=False
+        )
+
+        mock_logger.info.assert_called_once()
+        call_args = mock_logger.info.call_args[0]
+        assert "Generated response test-empty" in call_args[0]
+        assert "" == call_args[1]
+        assert [] == call_args[2]
+        assert "stop" == call_args[3]
+
+
+def test_request_logger_log_outputs_integration():
+    """Test that log_outputs can be called alongside log_inputs."""
+    mock_logger = MagicMock()
+
+    with patch('vllm.entrypoints.logger.logger', mock_logger):
+        request_logger = RequestLogger(max_log_len=None)
+
+        # Test that both methods can be called without interference
+        request_logger.log_inputs(
+            request_id="test-integration",
+            prompt="Test prompt",
+            prompt_token_ids=[1, 2, 3],
+            prompt_embeds=None,
+            params=None,
+            lora_request=None,
+            prompt_adapter_request=None
+        )
+
+        request_logger.log_outputs(
+            request_id="test-integration",
+            outputs="Test output",
+            output_token_ids=[4, 5, 6],
+            finish_reason="stop",
+            is_streaming=False,
+            delta=False
+        )
+
+        # Should have been called twice - once for inputs, once for outputs
+        assert mock_logger.info.call_count == 2
+
+        # Check that the calls were made with correct patterns
+        input_call = mock_logger.info.call_args_list[0][0]
+        output_call = mock_logger.info.call_args_list[1][0]
+
+        assert "Received request test-integration" in input_call[0]
+        assert "Generated response test-integration" in output_call[0]

vllm/entrypoints/logger.py

Lines changed: 26 additions & 0 deletions
@@ -48,3 +48,29 @@ def log_inputs(
             prompt, params, prompt_token_ids,
             prompt_embeds.shape if prompt_embeds is not None else None,
             lora_request, prompt_adapter_request)
+
+    def log_outputs(
+        self,
+        request_id: str,
+        outputs: str,
+        output_token_ids: Optional[list[int]],
+        finish_reason: Optional[str] = None,
+        is_streaming: bool = False,
+        delta: bool = False,
+    ) -> None:
+        max_log_len = self.max_log_len
+        if max_log_len is not None:
+            if outputs is not None:
+                outputs = outputs[:max_log_len]
+
+            if output_token_ids is not None:
+                output_token_ids = output_token_ids[:max_log_len]
+
+        stream_info = ""
+        if is_streaming:
+            stream_info = " (streaming delta)" if delta else " (streaming complete)"
+
+        logger.info(
+            "Generated response %s%s: output: %r, "
+            "output_token_ids: %s, finish_reason: %s",
+            request_id, stream_info, outputs, output_token_ids, finish_reason)
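
As a quick illustration (not part of the commit), the new method can be exercised directly; this sketch assumes vLLM is installed and simply mirrors the tests above, with invented request IDs and token IDs:

from vllm.entrypoints.logger import RequestLogger

request_logger = RequestLogger(max_log_len=64)  # truncate logged text/token ids to 64 entries

# Non-streaming: log the full generation once.
request_logger.log_outputs(
    request_id="cmpl-example",  # illustrative id
    outputs="The capital of France is Paris.",
    output_token_ids=[791, 6864, 315, 9822, 374, 12366, 13],  # illustrative ids
    finish_reason="stop",
    is_streaming=False,
    delta=False,
)

# Streaming: each delta can be logged as it is produced.
request_logger.log_outputs(
    request_id="cmpl-example",
    outputs="Paris",
    output_token_ids=[12366],
    finish_reason=None,
    is_streaming=True,
    delta=True,
)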

vllm/entrypoints/openai/api_server.py

Lines changed: 2 additions & 0 deletions
@@ -1504,6 +1504,7 @@ async def init_app_state(
         reasoning_parser=args.reasoning_parser,
         enable_prompt_tokens_details=args.enable_prompt_tokens_details,
         enable_force_include_usage=args.enable_force_include_usage,
+        enable_log_outputs=args.enable_log_outputs,
     ) if model_config.runner_type == "generate" else None
     state.openai_serving_chat = OpenAIServingChat(
         engine_client,
@@ -1521,6 +1522,7 @@
         reasoning_parser=args.reasoning_parser,
         enable_prompt_tokens_details=args.enable_prompt_tokens_details,
         enable_force_include_usage=args.enable_force_include_usage,
+        enable_log_outputs=args.enable_log_outputs,
     ) if model_config.runner_type == "generate" else None
     state.openai_serving_completion = OpenAIServingCompletion(
         engine_client,

vllm/entrypoints/openai/cli_args.py

Lines changed: 8 additions & 0 deletions
@@ -295,6 +295,14 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
         help=
         "If set to True, enable tracking server_load_metrics in the app state."
     )
+    parser.add_argument(
+        "--enable-log-outputs",
+        action='store_true',
+        default=False,
+        help=
+        "If set to True, enable logging of model outputs (generations) "
+        "in addition to the input logging that is enabled by default."
+    )
 
     return parser

vllm/entrypoints/openai/serving_chat.py

Lines changed: 62 additions & 0 deletions
@@ -67,6 +67,7 @@ def __init__(
         tool_parser: Optional[str] = None,
         enable_prompt_tokens_details: bool = False,
         enable_force_include_usage: bool = False,
+        enable_log_outputs: bool = False,
     ) -> None:
         super().__init__(engine_client=engine_client,
                          model_config=model_config,
@@ -78,6 +79,7 @@ def __init__(
         self.response_role = response_role
         self.chat_template = chat_template
         self.chat_template_content_format: Final = chat_template_content_format
+        self.enable_log_outputs = enable_log_outputs
 
         # set up tool use
         self.enable_auto_tools: bool = enable_auto_tools
@@ -823,6 +825,24 @@ async def chat_completion_stream_generator(
                     if delta_message is None:
                         continue
 
+                    # Log individual streaming delta if output logging is enabled
+                    if self.enable_log_outputs and self.request_logger:
+                        delta_content = ""
+                        if delta_message.content:
+                            delta_content = delta_message.content
+                        elif delta_message.tool_calls and delta_message.tool_calls[0].function and delta_message.tool_calls[0].function.arguments:
+                            delta_content = delta_message.tool_calls[0].function.arguments
+
+                        if delta_content:
+                            self.request_logger.log_outputs(
+                                request_id=request_id,
+                                outputs=delta_content,
+                                output_token_ids=list(output.token_ids),
+                                finish_reason=output.finish_reason,
+                                is_streaming=True,
+                                delta=True,
+                            )
+
                     if output.finish_reason is None:
                         # Send token-by-token response for each request.n
                         choice_data = ChatCompletionResponseStreamChoice(
@@ -943,6 +963,19 @@ async def chat_completion_stream_generator(
                 completion_tokens=num_completion_tokens,
                 total_tokens=num_prompt_tokens + num_completion_tokens)
 
+            # Log complete streaming response if output logging is enabled
+            if self.enable_log_outputs and self.request_logger:
+                # Collect all generated text from the SSE decoder if available
+                # For now, we'll log the completion tokens count as final output
+                self.request_logger.log_outputs(
+                    request_id=request_id,
+                    outputs=f"<streaming_complete: {num_completion_tokens} tokens>",
+                    output_token_ids=None,
+                    finish_reason="streaming_complete",
+                    is_streaming=True,
+                    delta=False,
+                )
+
         except Exception as e:
             # TODO: Use a vllm-specific Validation Error
             logger.exception("Error in chat completion stream generator.")
@@ -1156,6 +1189,35 @@ async def chat_completion_full_generator(
             kv_transfer_params=final_res.kv_transfer_params,
         )
 
+        # Log complete response if output logging is enabled
+        if self.enable_log_outputs and self.request_logger:
+            for choice in choices:
+                output_text = ""
+                if choice.message.content:
+                    output_text = choice.message.content
+                elif choice.message.tool_calls:
+                    # For tool calls, log the function name and arguments
+                    tool_call_info = []
+                    for tool_call in choice.message.tool_calls:
+                        if hasattr(tool_call.function, 'name') and hasattr(tool_call.function, 'arguments'):
+                            tool_call_info.append(f"{tool_call.function.name}({tool_call.function.arguments})")
+                    output_text = f"[tool_calls: {', '.join(tool_call_info)}]"
+
+                if output_text:
+                    # Get the corresponding output token IDs
+                    output_token_ids = None
+                    if choice.index < len(final_res.outputs):
+                        output_token_ids = final_res.outputs[choice.index].token_ids
+
+                    self.request_logger.log_outputs(
+                        request_id=request_id,
+                        outputs=output_text,
+                        output_token_ids=output_token_ids,
+                        finish_reason=choice.finish_reason,
+                        is_streaming=False,
+                        delta=False,
+                    )
+
         return response
 
     def _get_top_logprobs(
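
For context (not part of the commit), the streaming-delta path above is exercised by any streaming chat completion request against a server started with --enable-log-outputs. A minimal client sketch using the OpenAI Python client, with a placeholder endpoint and model name:

from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

stream = client.chat.completions.create(
    model="MODEL_NAME",  # placeholder: whatever model the server was launched with
    messages=[{"role": "user", "content": "Say hello in one sentence."}],
    stream=True,
)
# Each streamed delta received here should also appear in the server log as a
# "Generated response ... (streaming delta)" entry.
for chunk in stream:
    if chunk.choices and chunk.choices[0].delta.content:
        print(chunk.choices[0].delta.content, end="", flush=True)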
