
Commit 6185d66

logger: Include full text in streaming response logs
Previously, the log message for a completed streaming response only included the number of generated tokens, which limited debugging and auditing capabilities. This change:

- Modifies the streaming response logging to include the full concatenated text instead of just token counts
- Adds test coverage to verify the full-text logging behavior
- Ensures all logger.info call argument indices are correct in tests

The change improves the utility of logs for debugging and auditing by providing complete output records.

Signed-off-by: Adrian Garcia <adrian.garcia@inceptionai.ai>
1 parent 42085ad commit 6185d66

2 files changed: +55 / -14 lines changed


tests/test_logger.py

Lines changed: 33 additions & 0 deletions
@@ -476,3 +476,36 @@ def test_request_logger_log_outputs_integration():
     # Check output call: logger.info(format_string, request_id, stream_info, outputs, ...)
     assert "Generated response %s%s" in output_call[0]
     assert output_call[1] == "test-integration"
+
+
+def test_streaming_complete_logs_full_text_content():
+    """Test that streaming complete logging includes full accumulated text, not just token count."""
+    mock_logger = MagicMock()
+
+    with patch("vllm.entrypoints.logger.logger", mock_logger):
+        request_logger = RequestLogger(max_log_len=None)
+
+        # Test with actual content instead of token count format
+        full_response = "This is a complete response from streaming"
+        request_logger.log_outputs(
+            request_id="test-streaming-full-text",
+            outputs=full_response,
+            output_token_ids=None,
+            finish_reason="streaming_complete",
+            is_streaming=True,
+            delta=False,
+        )
+
+        mock_logger.info.assert_called_once()
+        call_args = mock_logger.info.call_args.args
+
+        # Verify the logged output is the full text, not a token count format
+        logged_output = call_args[3]
+        assert logged_output == full_response
+        assert "tokens>" not in logged_output  # Ensure it's not the old token count format
+        assert "streaming_complete" not in logged_output  # Ensure it's not the fallback format
+
+        # Verify other parameters
+        assert call_args[1] == "test-streaming-full-text"
+        assert call_args[2] == " (streaming complete)"
+        assert call_args[5] == "streaming_complete"

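For orientation, the positional assertions in the new test (call_args[1] = request ID, call_args[2] = " (streaming complete)", call_args[3] = output text, call_args[5] = finish reason) imply a logger.info call shaped roughly like the stand-in below. This is a minimal sketch, not vLLM's actual RequestLogger.log_outputs: only the "Generated response %s%s" prefix and the argument order come from the test; the rest of the format string and the max_log_len truncation are assumptions.

import logging

logger = logging.getLogger("log_outputs_sketch")


def log_outputs_sketch(request_id, outputs, output_token_ids, finish_reason,
                       is_streaming, delta, max_log_len=None):
    # Hypothetical stand-in that matches the argument positions the test checks:
    # args[1]=request_id, args[2]=stream_info, args[3]=outputs, args[5]=finish_reason.
    if max_log_len is not None and isinstance(outputs, str):
        outputs = outputs[:max_log_len]  # assumed truncation behavior
    stream_info = ("" if not is_streaming else
                   " (streaming delta)" if delta else " (streaming complete)")
    logger.info(
        "Generated response %s%s: output: %s, output_token_ids: %s, "
        "finish_reason: %s",
        request_id, stream_info, outputs, output_token_ids, finish_reason)

The new test can be run on its own with: pytest tests/test_logger.py::test_streaming_complete_logs_full_text_content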
vllm/entrypoints/openai/serving_chat.py

Lines changed: 22 additions & 14 deletions
@@ -491,20 +491,21 @@ async def chat_completion_stream_generator(
         all_previous_token_ids: Optional[list[list[int]]]
         function_name_returned = [False] * num_choices
 
+        # Always track previous_texts for comprehensive output logging
+        previous_texts = [""] * num_choices
+
         # Only one of these will be used, thus previous_texts and
         # all_previous_token_ids will not be used twice in the same iteration.
         if tool_choice_auto or self.reasoning_parser:
             # These are only required in "auto" tool choice case
-            previous_texts = [""] * num_choices
             all_previous_token_ids = [[]] * num_choices
             # For reasoning parser and tool call all enabled
             added_content_delta_arr = [False] * num_choices
             reasoning_end_arr = [False] * num_choices
         elif request.tool_choice == "required":
-            previous_texts = [""] * num_choices
             all_previous_token_ids = None
         else:
-            previous_texts, all_previous_token_ids = None, None
+            all_previous_token_ids = None
 
         try:
             if self.reasoning_parser:
@@ -847,6 +848,10 @@ async def chat_completion_stream_generator(
                     assert all_previous_token_ids is not None
                     previous_texts[i] = current_text
                     all_previous_token_ids[i] = current_token_ids
+                else:
+                    # Update previous_texts for comprehensive logging even in simple content case
+                    assert previous_texts is not None
+                    previous_texts[i] += delta_text
 
                 # set the previous values for the next iteration
                 previous_num_tokens[i] += len(output.token_ids)
@@ -1014,17 +1019,20 @@ async def chat_completion_stream_generator(
 
             # Log complete streaming response if output logging is enabled
             if self.enable_log_outputs and self.request_logger:
-                # Collect all generated text from the SSE decoder if available
-                # For now, we'll log the completion tokens count as final output
-                self.request_logger.log_outputs(
-                    request_id=request_id,
-                    outputs=
-                    f"<streaming_complete: {num_completion_tokens} tokens>",
-                    output_token_ids=None,
-                    finish_reason="streaming_complete",
-                    is_streaming=True,
-                    delta=False,
-                )
+                # Log the complete response for each choice
+                for i in range(num_choices):
+                    full_text = (previous_texts[i] if previous_texts
+                                 and i < len(previous_texts) else
+                                 f"<streaming_complete: {previous_num_tokens[i]} tokens>"
+                                 )
+                    self.request_logger.log_outputs(
+                        request_id=request_id,
+                        outputs=full_text,
+                        output_token_ids=None,  # Consider also logging all token IDs
+                        finish_reason="streaming_complete",
+                        is_streaming=True,
+                        delta=False,
+                    )
 
         except Exception as e:
             # TODO: Use a vllm-specific Validation Error

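Read in isolation, the per-choice logging added at the end of the stream amounts to the small helper below. This is a sketch that mirrors the diff above; finalize_stream_logging and log_outputs_fn are illustrative names, not part of vLLM, and the logger call is abstracted into a callback so the snippet stays self-contained.

from typing import Callable, Optional


def finalize_stream_logging(request_id: str,
                            previous_texts: Optional[list[str]],
                            previous_num_tokens: list[int],
                            log_outputs_fn: Callable[..., None]) -> None:
    # One record per choice: the full accumulated text when it was tracked,
    # otherwise the old token-count placeholder as a fallback.
    for i, num_tokens in enumerate(previous_num_tokens):
        full_text = (previous_texts[i]
                     if previous_texts and i < len(previous_texts)
                     else f"<streaming_complete: {num_tokens} tokens>")
        log_outputs_fn(
            request_id=request_id,
            outputs=full_text,
            output_token_ids=None,
            finish_reason="streaming_complete",
            is_streaming=True,
            delta=False,
        )

Because previous_texts is now always initialized to [""] * num_choices and appended to in the plain-content branch, the token-count fallback should only be reachable if text tracking is somehow skipped or the lists fall out of sync.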