
Commit 6185d66

logger: Include full text in streaming response logs
Previously, the log message for a completed streaming response only included the number of generated tokens, which limited debugging and auditing capabilities. This change:

- Modifies the streaming response logging to include the full concatenated text instead of just token counts
- Adds test coverage to verify the full-text logging behavior
- Ensures all logger.info call argument indices are correct in tests

The change improves the utility of logs for debugging and auditing by providing complete output records.

Signed-off-by: Adrian Garcia <adrian.garcia@inceptionai.ai>
1 parent 42085ad commit 6185d66

2 files changed: +55 / -14 lines changed


tests/test_logger.py

Lines changed: 33 additions & 0 deletions
@@ -476,3 +476,36 @@ def test_request_logger_log_outputs_integration():
     # Check output call: logger.info(format_string, request_id, stream_info, outputs, ...)
     assert "Generated response %s%s" in output_call[0]
     assert output_call[1] == "test-integration"
+
+
+def test_streaming_complete_logs_full_text_content():
+    """Test that streaming complete logging includes full accumulated text, not just token count."""
+    mock_logger = MagicMock()
+
+    with patch("vllm.entrypoints.logger.logger", mock_logger):
+        request_logger = RequestLogger(max_log_len=None)
+
+        # Test with actual content instead of token count format
+        full_response = "This is a complete response from streaming"
+        request_logger.log_outputs(
+            request_id="test-streaming-full-text",
+            outputs=full_response,
+            output_token_ids=None,
+            finish_reason="streaming_complete",
+            is_streaming=True,
+            delta=False,
+        )
+
+        mock_logger.info.assert_called_once()
+        call_args = mock_logger.info.call_args.args
+
+        # Verify the logged output is the full text, not a token count format
+        logged_output = call_args[3]
+        assert logged_output == full_response
+        assert "tokens>" not in logged_output  # Ensure it's not the old token count format
+        assert "streaming_complete" not in logged_output  # Ensure it's not the fallback format
+
+        # Verify other parameters
+        assert call_args[1] == "test-streaming-full-text"
+        assert call_args[2] == " (streaming complete)"
+        assert call_args[5] == "streaming_complete"

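For orientation, the positional assertions in the new test (call_args[1] = request ID, call_args[2] = " (streaming complete)", call_args[3] = output text, call_args[5] = finish reason) imply a logger.info call shaped roughly like the stand-in below. This is a minimal sketch, not vLLM's actual RequestLogger.log_outputs: only the "Generated response %s%s" prefix and the argument order come from the test; the rest of the format string and the max_log_len truncation are assumptions.

import logging

logger = logging.getLogger("log_outputs_sketch")


def log_outputs_sketch(request_id, outputs, output_token_ids, finish_reason,
                       is_streaming, delta, max_log_len=None):
    # Hypothetical stand-in that matches the argument positions the test checks:
    # args[1]=request_id, args[2]=stream_info, args[3]=outputs, args[5]=finish_reason.
    if max_log_len is not None and isinstance(outputs, str):
        outputs = outputs[:max_log_len]  # assumed truncation behavior
    stream_info = ("" if not is_streaming else
                   " (streaming delta)" if delta else " (streaming complete)")
    logger.info(
        "Generated response %s%s: output: %s, output_token_ids: %s, "
        "finish_reason: %s",
        request_id, stream_info, outputs, output_token_ids, finish_reason)

The new test can be run on its own with: pytest tests/test_logger.py::test_streaming_complete_logs_full_text_content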
vllm/entrypoints/openai/serving_chat.py

Lines changed: 22 additions & 14 deletions
@@ -491,20 +491,21 @@ async def chat_completion_stream_generator(
         all_previous_token_ids: Optional[list[list[int]]]
         function_name_returned = [False] * num_choices
 
+        # Always track previous_texts for comprehensive output logging
+        previous_texts = [""] * num_choices
+
         # Only one of these will be used, thus previous_texts and
         # all_previous_token_ids will not be used twice in the same iteration.
         if tool_choice_auto or self.reasoning_parser:
             # These are only required in "auto" tool choice case
-            previous_texts = [""] * num_choices
             all_previous_token_ids = [[]] * num_choices
             # For reasoning parser and tool call all enabled
             added_content_delta_arr = [False] * num_choices
             reasoning_end_arr = [False] * num_choices
         elif request.tool_choice == "required":
-            previous_texts = [""] * num_choices
             all_previous_token_ids = None
         else:
-            previous_texts, all_previous_token_ids = None, None
+            all_previous_token_ids = None
 
         try:
             if self.reasoning_parser:
@@ -847,6 +848,10 @@ async def chat_completion_stream_generator(
                     assert all_previous_token_ids is not None
                     previous_texts[i] = current_text
                     all_previous_token_ids[i] = current_token_ids
+                else:
+                    # Update previous_texts for comprehensive logging even in simple content case
+                    assert previous_texts is not None
+                    previous_texts[i] += delta_text
 
                 # set the previous values for the next iteration
                 previous_num_tokens[i] += len(output.token_ids)
@@ -1014,17 +1019,20 @@ async def chat_completion_stream_generator(
 
             # Log complete streaming response if output logging is enabled
             if self.enable_log_outputs and self.request_logger:
-                # Collect all generated text from the SSE decoder if available
-                # For now, we'll log the completion tokens count as final output
-                self.request_logger.log_outputs(
-                    request_id=request_id,
-                    outputs=
-                    f"<streaming_complete: {num_completion_tokens} tokens>",
-                    output_token_ids=None,
-                    finish_reason="streaming_complete",
-                    is_streaming=True,
-                    delta=False,
-                )
+                # Log the complete response for each choice
+                for i in range(num_choices):
+                    full_text = (previous_texts[i] if previous_texts
+                                 and i < len(previous_texts) else
+                                 f"<streaming_complete: {previous_num_tokens[i]} tokens>"
+                                 )
+                    self.request_logger.log_outputs(
+                        request_id=request_id,
+                        outputs=full_text,
+                        output_token_ids=None,  # Consider also logging all token IDs
+                        finish_reason="streaming_complete",
+                        is_streaming=True,
+                        delta=False,
+                    )
 
         except Exception as e:
             # TODO: Use a vllm-specific Validation Error

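Read in isolation, the per-choice logging added at the end of the stream amounts to the small helper below. This is a sketch that mirrors the diff above; finalize_stream_logging and log_outputs_fn are illustrative names, not part of vLLM, and the logger call is abstracted into a callback so the snippet stays self-contained.

from typing import Callable, Optional


def finalize_stream_logging(request_id: str,
                            previous_texts: Optional[list[str]],
                            previous_num_tokens: list[int],
                            log_outputs_fn: Callable[..., None]) -> None:
    # One record per choice: the full accumulated text when it was tracked,
    # otherwise the old token-count placeholder as a fallback.
    for i, num_tokens in enumerate(previous_num_tokens):
        full_text = (previous_texts[i]
                     if previous_texts and i < len(previous_texts)
                     else f"<streaming_complete: {num_tokens} tokens>")
        log_outputs_fn(
            request_id=request_id,
            outputs=full_text,
            output_token_ids=None,
            finish_reason="streaming_complete",
            is_streaming=True,
            delta=False,
        )

Because previous_texts is now always initialized to [""] * num_choices and appended to in the plain-content branch, the token-count fallback should only be reachable if text tracking is somehow skipped or the lists fall out of sync.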