diff --git a/tests/test_logger.py b/tests/test_logger.py
index 8f235f1474f..c29bc47cc9d 100644
--- a/tests/test_logger.py
+++ b/tests/test_logger.py
@@ -10,11 +10,12 @@
 from json.decoder import JSONDecodeError
 from tempfile import NamedTemporaryFile
 from typing import Any
-from unittest.mock import patch
+from unittest.mock import MagicMock, patch
 from uuid import uuid4
 
 import pytest
 
+from vllm.entrypoints.logger import RequestLogger
 from vllm.logger import (_DATE_FORMAT, _FORMAT, _configure_vllm_root_logger,
                          enable_trace_function_call, init_logger)
 from vllm.logging_utils import NewLineFormatter
@@ -228,9 +229,10 @@ def test_prepare_object_to_dump():
     list_obj = [1, 2, 3]
     assert prepare_object_to_dump(list_obj) == '[1, 2, 3]'
 
-    dict_obj = {'a': 1, 'b': 'b'}
+    dict_obj = {"a": 1, "b": "b"}
     assert prepare_object_to_dump(dict_obj) in [
-        "{a: 1, b: 'b'}", "{b: 'b', a: 1}"
+        "{a: 1, b: 'b'}",
+        "{b: 'b', a: 1}",
     ]
 
     set_obj = {1, 2, 3}
@@ -252,4 +254,247 @@ class CustomClass:
         b: str
 
     assert (prepare_object_to_dump(CustomClass(
-        1, 'b')) == "CustomClass(a=1, b='b')")
+        1, "b")) == "CustomClass(a=1, b='b')")
+
+
+def test_request_logger_log_outputs():
+    """Test the new log_outputs functionality."""
+    # Create a mock logger to capture log calls
+    mock_logger = MagicMock()
+
+    with patch("vllm.entrypoints.logger.logger", mock_logger):
+        request_logger = RequestLogger(max_log_len=None)
+
+        # Test basic output logging
+        request_logger.log_outputs(
+            request_id="test-123",
+            outputs="Hello, world!",
+            output_token_ids=[1, 2, 3, 4],
+            finish_reason="stop",
+            is_streaming=False,
+            delta=False,
+        )
+
+        mock_logger.info.assert_called_once()
+        call_args = mock_logger.info.call_args.args
+        assert "Generated response %s%s" in call_args[0]
+        assert call_args[1] == "test-123"
+        assert call_args[3] == "Hello, world!"
+        assert call_args[4] == [1, 2, 3, 4]
+        assert call_args[5] == "stop"
+
+
+def test_request_logger_log_outputs_streaming_delta():
+    """Test log_outputs with streaming delta mode."""
+    mock_logger = MagicMock()
+
+    with patch("vllm.entrypoints.logger.logger", mock_logger):
+        request_logger = RequestLogger(max_log_len=None)
+
+        # Test streaming delta logging
+        request_logger.log_outputs(
+            request_id="test-456",
+            outputs="Hello",
+            output_token_ids=[1],
+            finish_reason=None,
+            is_streaming=True,
+            delta=True,
+        )
+
+        mock_logger.info.assert_called_once()
+        call_args = mock_logger.info.call_args.args
+        assert "Generated response %s%s" in call_args[0]
+        assert call_args[1] == "test-456"
+        assert call_args[2] == " (streaming delta)"
+        assert call_args[3] == "Hello"
+        assert call_args[4] == [1]
+        assert call_args[5] is None
+
+
+def test_request_logger_log_outputs_streaming_complete():
+    """Test log_outputs with streaming complete mode."""
+    mock_logger = MagicMock()
+
+    with patch("vllm.entrypoints.logger.logger", mock_logger):
+        request_logger = RequestLogger(max_log_len=None)
+
+        # Test streaming complete logging
+        request_logger.log_outputs(
+            request_id="test-789",
+            outputs="Complete response",
+            output_token_ids=[1, 2, 3],
+            finish_reason="length",
+            is_streaming=True,
+            delta=False,
+        )
+
+        mock_logger.info.assert_called_once()
+        call_args = mock_logger.info.call_args.args
+        assert "Generated response %s%s" in call_args[0]
+        assert call_args[1] == "test-789"
+        assert call_args[2] == " (streaming complete)"
+        assert call_args[3] == "Complete response"
+        assert call_args[4] == [1, 2, 3]
+        assert call_args[5] == "length"
+
+
+def test_request_logger_log_outputs_with_truncation():
+    """Test log_outputs respects max_log_len setting."""
+    mock_logger = MagicMock()
+
+    with patch("vllm.entrypoints.logger.logger", mock_logger):
+        # Set max_log_len to 10
+        request_logger = RequestLogger(max_log_len=10)
+
+        # Test output truncation
+        long_output = "This is a very long output that should be truncated"
+        long_token_ids = list(range(20))  # 20 tokens
+
+        request_logger.log_outputs(
+            request_id="test-truncate",
+            outputs=long_output,
+            output_token_ids=long_token_ids,
+            finish_reason="stop",
+            is_streaming=False,
+            delta=False,
+        )
+
+        mock_logger.info.assert_called_once()
+        call_args = mock_logger.info.call_args
+
+        # Check that output was truncated to first 10 characters
+        logged_output = call_args[0][3]
+        assert logged_output == "This is a "
+        assert len(logged_output) == 10
+
+        # Check that token IDs were truncated to first 10 tokens
+        logged_token_ids = call_args[0][4]
+        assert logged_token_ids == list(range(10))
+        assert len(logged_token_ids) == 10
+
+
+def test_request_logger_log_outputs_none_values():
+    """Test log_outputs handles None values correctly."""
+    mock_logger = MagicMock()
+
+    with patch("vllm.entrypoints.logger.logger", mock_logger):
+        request_logger = RequestLogger(max_log_len=None)
+
+        # Test with None output_token_ids
+        request_logger.log_outputs(
+            request_id="test-none",
+            outputs="Test output",
+            output_token_ids=None,
+            finish_reason="stop",
+            is_streaming=False,
+            delta=False,
+        )
+
+        mock_logger.info.assert_called_once()
+        call_args = mock_logger.info.call_args.args
+        assert "Generated response %s%s" in call_args[0]
+        assert call_args[1] == "test-none"
+        assert call_args[3] == "Test output"
+        assert call_args[4] is None
+        assert call_args[5] == "stop"
+
+
+def test_request_logger_log_outputs_empty_output():
+    """Test log_outputs handles empty output correctly."""
+    mock_logger = MagicMock()
+
+    with patch("vllm.entrypoints.logger.logger", mock_logger):
+        request_logger = RequestLogger(max_log_len=5)
+
+        # Test with empty output
+        request_logger.log_outputs(
+            request_id="test-empty",
+            outputs="",
+            output_token_ids=[],
+            finish_reason="stop",
+            is_streaming=False,
+            delta=False,
+        )
+
+        mock_logger.info.assert_called_once()
+        call_args = mock_logger.info.call_args.args
+        assert "Generated response %s%s" in call_args[0]
+        assert call_args[1] == "test-empty"
+        assert call_args[3] == ""
+        assert call_args[4] == []
+        assert call_args[5] == "stop"
+
+
+def test_request_logger_log_outputs_integration():
+    """Test that log_outputs can be called alongside log_inputs."""
+    mock_logger = MagicMock()
+
+    with patch("vllm.entrypoints.logger.logger", mock_logger):
+        request_logger = RequestLogger(max_log_len=None)
+
+        # Test that both methods can be called without interference
+        request_logger.log_inputs(
+            request_id="test-integration",
+            prompt="Test prompt",
+            prompt_token_ids=[1, 2, 3],
+            prompt_embeds=None,
+            params=None,
+            lora_request=None,
+            prompt_adapter_request=None,
+        )
+
+        request_logger.log_outputs(
+            request_id="test-integration",
+            outputs="Test output",
+            output_token_ids=[4, 5, 6],
+            finish_reason="stop",
+            is_streaming=False,
+            delta=False,
+        )
+
+        # Should have been called twice - once for inputs, once for outputs
+        assert mock_logger.info.call_count == 2
+
+        # Check that the calls were made with correct patterns
+        input_call = mock_logger.info.call_args_list[0][0]
+        output_call = mock_logger.info.call_args_list[1][0]
+
+        assert "Received request %s" in input_call[0]
+        assert input_call[1] == "test-integration"
+
+        assert "Generated response %s%s" in output_call[0]
+        assert output_call[1] == "test-integration"
+
+
+def test_streaming_complete_logs_full_text_content():
+    """Test that streaming complete logging includes
+    full accumulated text, not just token count."""
+    mock_logger = MagicMock()
+
+    with patch("vllm.entrypoints.logger.logger", mock_logger):
+        request_logger = RequestLogger(max_log_len=None)
+
+        # Test with actual content instead of token count format
+        full_response = "This is a complete response from streaming"
+        request_logger.log_outputs(
+            request_id="test-streaming-full-text",
+            outputs=full_response,
+            output_token_ids=None,
+            finish_reason="streaming_complete",
+            is_streaming=True,
+            delta=False,
+        )
+
+        mock_logger.info.assert_called_once()
+        call_args = mock_logger.info.call_args.args
+
+        # Verify the logged output is the full text, not a token count format
+        logged_output = call_args[3]
+        assert logged_output == full_response
+        assert "tokens>" not in logged_output
+        assert "streaming_complete" not in logged_output
+
+        # Verify other parameters
+        assert call_args[1] == "test-streaming-full-text"
+        assert call_args[2] == " (streaming complete)"
+        assert call_args[5] == "streaming_complete"
diff --git a/vllm/entrypoints/logger.py b/vllm/entrypoints/logger.py
index f3aee188dae..d90163d051d 100644
--- a/vllm/entrypoints/logger.py
+++ b/vllm/entrypoints/logger.py
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
+from collections.abc import Sequence
 from typing import Optional, Union
 
 import torch
@@ -48,3 +49,36 @@ def log_inputs(
             prompt, params, prompt_token_ids,
             prompt_embeds.shape if prompt_embeds is not None else None,
             lora_request, prompt_adapter_request)
+
+    def log_outputs(
+        self,
+        request_id: str,
+        outputs: str,
+        output_token_ids: Optional[Sequence[int]],
+        finish_reason: Optional[str] = None,
+        is_streaming: bool = False,
+        delta: bool = False,
+    ) -> None:
+        max_log_len = self.max_log_len
+        if max_log_len is not None:
+            if outputs is not None:
+                outputs = outputs[:max_log_len]
+
+            if output_token_ids is not None:
+                # Convert to list and apply truncation
+                output_token_ids = list(output_token_ids)[:max_log_len]
+
+        stream_info = ""
+        if is_streaming:
+            stream_info = (" (streaming delta)"
+                           if delta else " (streaming complete)")
+
+        logger.info(
+            "Generated response %s%s: output: %r, "
+            "output_token_ids: %s, finish_reason: %s",
+            request_id,
+            stream_info,
+            outputs,
+            output_token_ids,
+            finish_reason,
+        )
diff --git a/vllm/entrypoints/openai/cli_args.py b/vllm/entrypoints/openai/cli_args.py
index 4f8aaab772f..1651639917f 100644
--- a/vllm/entrypoints/openai/cli_args.py
+++ b/vllm/entrypoints/openai/cli_args.py
@@ -41,10 +41,10 @@ def __call__(
 
         lora_list: list[LoRAModulePath] = []
         for item in values:
-            if item in [None, '']:  # Skip if item is None or empty string
+            if item in [None, ""]:  # Skip if item is None or empty string
                 continue
-            if '=' in item and ',' not in item:  # Old format: name=path
-                name, path = item.split('=')
+            if "=" in item and "," not in item:  # Old format: name=path
+                name, path = item.split("=")
                 lora_list.append(LoRAModulePath(name, path))
             else:  # Assume JSON format
                 try:
@@ -77,7 +77,7 @@ def __call__(
 
         adapter_list: list[PromptAdapterPath] = []
         for item in values:
-            name, path = item.split('=')
+            name, path = item.split("=")
             adapter_list.append(PromptAdapterPath(name, path))
         setattr(namespace, self.dest, adapter_list)
 
@@ -95,10 +95,10 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
         choices=['debug', 'info', 'warning', 'error', 'critical', 'trace'],
         help="Log level for uvicorn.")
     parser.add_argument("--disable-uvicorn-access-log",
-                        action="store_true",
+                        action='store_true',
                         help="Disable uvicorn access log.")
     parser.add_argument("--allow-credentials",
-                        action="store_true",
+                        action='store_true',
                         help="Allow credentials.")
     parser.add_argument("--allowed-origins",
                         type=json.loads,
@@ -136,15 +136,18 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
                        nargs='+',
                        action=PromptAdapterParserAction,
                        help="Prompt adapter configurations in the format name=path. "
-                        "Multiple adapters can be specified.")
-    parser.add_argument("--chat-template",
-                        type=optional_type(str),
-                        default=None,
-                        help="The file path to the chat template, "
-                        "or the template in single-line form "
-                        "for the specified model.")
+                        "Multiple adapters can be specified.",
+    )
+    parser.add_argument(
+        "--chat-template",
+        type=optional_type(str),
+        default=None,
+        help="The file path to the chat template, "
+        "or the template in single-line form "
+        "for the specified model.",
+    )
     parser.add_argument(
-        '--chat-template-content-format',
+        "--chat-template-content-format",
         type=str,
         default="auto",
         choices=get_args(ChatTemplateContentFormatOption),
@@ -153,7 +156,7 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
         '* "string" will render the content as a string. '
         'Example: ``"Hello World"``\n'
         '* "openai" will render the content as a list of dictionaries, '
-        'similar to OpenAI schema. '
+        "similar to OpenAI schema. "
" 'Example: ``[{"type": "text", "text": "Hello world!"}]``') parser.add_argument("--response-role", type=optional_type(str), @@ -174,7 +177,7 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: help="The CA certificates file.") parser.add_argument( "--enable-ssl-refresh", - action="store_true", + action='store_true', default=False, help="Refresh SSL Context when SSL certificate files change") parser.add_argument( @@ -203,29 +206,29 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: "using ``app.add_middleware()``. ") parser.add_argument( "--return-tokens-as-token-ids", - action="store_true", + action='store_true', help="When ``--max-logprobs`` is specified, represents single tokens " " as strings of the form 'token_id:{token_id}' so that tokens " "that are not JSON-encodable can be identified.") parser.add_argument( "--disable-frontend-multiprocessing", - action="store_true", + action='store_true', help="If specified, will run the OpenAI frontend server in the same " "process as the model serving engine.") parser.add_argument( "--enable-request-id-headers", - action="store_true", + action='store_true', help="If specified, API server will add X-Request-Id header to " "responses.") parser.add_argument( "--enable-auto-tool-choice", - action="store_true", + action='store_true', default=False, help="Enable auto tool choice for supported models. Use " "``--tool-call-parser`` to specify which parser to use.") parser.add_argument( "--expand-tools-even-if-tool-choice-none", - action="store_true", + action='store_true', default=False, deprecated=True, help="Include tool definitions in prompts " @@ -260,8 +263,7 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: "--log-config-file", type=str, default=envs.VLLM_LOGGING_CONFIG_PATH, - help="Path to logging config JSON file for both vllm and uvicorn", - ) + help="Path to logging config JSON file for both vllm and uvicorn") parser = AsyncEngineArgs.add_cli_args(parser) @@ -287,7 +289,8 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: "--enable-force-include-usage", action='store_true', default=False, - help="If set to True, including usage on every request.") + help="If set to True, including usage on every request.", + ) parser.add_argument( "--enable-server-load-tracking", action='store_true', @@ -295,6 +298,13 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: help= "If set to True, enable tracking server_load_metrics in the app state." 
     )
+    parser.add_argument(
+        "--enable-log-outputs",
+        action='store_true',
+        default=False,
+        help="If set to True, enable logging of model outputs (generations) "
+        "in addition to the input logging that is enabled by default.",
+    )
 
     return parser
 
diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py
index 53509e8f65a..9bfae5f9586 100644
--- a/vllm/entrypoints/openai/serving_chat.py
+++ b/vllm/entrypoints/openai/serving_chat.py
@@ -67,6 +67,7 @@ def __init__(
         tool_parser: Optional[str] = None,
         enable_prompt_tokens_details: bool = False,
         enable_force_include_usage: bool = False,
+        enable_log_outputs: bool = False,
     ) -> None:
         super().__init__(engine_client=engine_client,
                          model_config=model_config,
@@ -78,6 +79,7 @@ def __init__(
         self.response_role = response_role
         self.chat_template = chat_template
         self.chat_template_content_format: Final = chat_template_content_format
+        self.enable_log_outputs = enable_log_outputs
 
         # set up tool use
         self.enable_auto_tools: bool = enable_auto_tools
@@ -467,20 +469,21 @@ async def chat_completion_stream_generator(
         all_previous_token_ids: Optional[list[list[int]]]
         function_name_returned = [False] * num_choices
 
+        # Always track previous_texts for comprehensive output logging
+        previous_texts = [""] * num_choices
+
         # Only one of these will be used, thus previous_texts and
         # all_previous_token_ids will not be used twice in the same iteration.
         if tool_choice_auto or self.reasoning_parser:
             # These are only required in "auto" tool choice case
-            previous_texts = [""] * num_choices
             all_previous_token_ids = [[]] * num_choices
             # For reasoning parser and tool call all enabled
             added_content_delta_arr = [False] * num_choices
             reasoning_end_arr = [False] * num_choices
         elif request.tool_choice == "required":
-            previous_texts = [""] * num_choices
             all_previous_token_ids = None
         else:
-            previous_texts, all_previous_token_ids = None, None
+            all_previous_token_ids = None
 
         try:
             if self.reasoning_parser:
@@ -792,6 +795,7 @@ async def chat_completion_stream_generator(
                                 current_token_ids=current_token_ids,
                                 delta_token_ids=output.token_ids,
                                 request=request))
 
+                    # when only reasoning
                     elif self.reasoning_parser:
                         delta_message = (reasoning_parser.
@@ -813,6 +817,10 @@ async def chat_completion_stream_generator(
                         assert all_previous_token_ids is not None
                         previous_texts[i] = current_text
                         all_previous_token_ids[i] = current_token_ids
+                    else:
+                        # Update for comprehensive logging even in simple case
+                        assert previous_texts is not None
+                        previous_texts[i] += delta_text
 
                     # set the previous values for the next iteration
                     previous_num_tokens[i] += len(output.token_ids)
@@ -824,6 +832,27 @@ async def chat_completion_stream_generator(
                     if delta_message is None:
                         continue
 
+                    # Log streaming delta if output logging is enabled
+                    if self.enable_log_outputs and self.request_logger:
+                        delta_content = ""
+                        if delta_message.content:
+                            delta_content = delta_message.content
+                        elif delta_message.tool_calls:
+                            delta_content = "".join(
+                                tc.function.arguments
+                                for tc in delta_message.tool_calls
+                                if tc.function and tc.function.arguments)
+
+                        if delta_content:
+                            self.request_logger.log_outputs(
+                                request_id=request_id,
+                                outputs=delta_content,
+                                output_token_ids=list(output.token_ids),
+                                finish_reason=output.finish_reason,
+                                is_streaming=True,
+                                delta=True,
+                            )
+
                     if output.finish_reason is None:
                         # Send token-by-token response for each request.n
                         choice_data = ChatCompletionResponseStreamChoice(
@@ -942,7 +971,27 @@ async def chat_completion_stream_generator(
             request_metadata.final_usage_info = UsageInfo(
                 prompt_tokens=num_prompt_tokens,
                 completion_tokens=num_completion_tokens,
-                total_tokens=num_prompt_tokens + num_completion_tokens)
+                total_tokens=num_prompt_tokens + num_completion_tokens,
+            )
+
+            # Log complete streaming response if output logging is enabled
+            if self.enable_log_outputs and self.request_logger:
+                # Log the complete response for each choice
+                for i in range(num_choices):
+                    full_text = (
+                        previous_texts[i]
+                        if previous_texts and i < len(previous_texts) else
+                        f"<streaming_complete: {previous_num_tokens[i]} tokens>"
+                    )
+                    self.request_logger.log_outputs(
+                        request_id=request_id,
+                        outputs=full_text,
+                        output_token_ids=
+                        None,  # Consider also logging all token IDs
+                        finish_reason="streaming_complete",
+                        is_streaming=True,
+                        delta=False,
+                    )
 
         except Exception as e:
             # TODO: Use a vllm-specific Validation Error
@@ -1035,8 +1084,10 @@ async def chat_completion_full_generator(
                 tool_calls=[
                     tool_call_class(function=FunctionCall(
                         name=request.tool_choice.function.name,
-                        arguments=content))
-                ])
+                        arguments=content,
+                    ))
+                ],
+            )
 
         elif request.tool_choice and request.tool_choice == "required":
             tool_call_class = MistralToolCall if isinstance(
@@ -1115,12 +1166,13 @@ async def chat_completion_full_generator(
                 finish_reason="tool_calls" if auto_tools_called else
                 output.finish_reason if output.finish_reason else "stop",
                 stop_reason=output.stop_reason)
+
             choices.append(choice_data)
 
         if request.echo:
             last_msg_content: Union[str, list[dict[str, str]]] = ""
-            if conversation and "content" in conversation[-1] and conversation[
-                    -1].get("role") == role:
+            if (conversation and "content" in conversation[-1]
+                    and conversation[-1].get("role") == role):
                 last_msg_content = conversation[-1]["content"] or ""
                 if isinstance(last_msg_content, list):
                     last_msg_content = "\n".join(msg['text']
@@ -1157,6 +1209,40 @@ async def chat_completion_full_generator(
             kv_transfer_params=final_res.kv_transfer_params,
         )
 
+        # Log complete response if output logging is enabled
+        if self.enable_log_outputs and self.request_logger:
+            for choice in choices:
+                output_text = ""
+                if choice.message.content:
+                    output_text = choice.message.content
+                elif choice.message.tool_calls:
+                    # For tool calls, log the function name and arguments
+                    tool_call_descriptions = []
+                    for tool_call in choice.message.tool_calls:
+                        if hasattr(tool_call.function, "name") and hasattr(
+                                tool_call.function, "arguments"):
+                            tool_call_descriptions.append(
+                                f"{tool_call.function.name}({tool_call.function.arguments})"
+                            )
+                    tool_calls_str = ", ".join(tool_call_descriptions)
+                    output_text = f"[tool_calls: {tool_calls_str}]"
+
+                if output_text:
+                    # Get the corresponding output token IDs
+                    output_token_ids = None
+                    if choice.index < len(final_res.outputs):
+                        output_token_ids = final_res.outputs[
+                            choice.index].token_ids
+
+                    self.request_logger.log_outputs(
+                        request_id=request_id,
+                        outputs=output_text,
+                        output_token_ids=output_token_ids,
+                        finish_reason=choice.finish_reason,
+                        is_streaming=False,
+                        delta=False,
+                    )
+
         return response
 
     def _get_top_logprobs(
@@ -1164,15 +1250,16 @@ def _get_top_logprobs(
             tokenizer: AnyTokenizer,
             should_return_as_token_id: bool) -> list[ChatCompletionLogProb]:
         return [
-            ChatCompletionLogProb(token=(token := self._get_decoded_token(
-                p[1],
-                p[0],
-                tokenizer,
-                return_as_token_id=should_return_as_token_id)),
-                                  logprob=max(p[1].logprob, -9999.0),
-                                  bytes=list(
-                                      token.encode("utf-8", errors="replace")))
-            for i, p in enumerate(logprobs.items())
+            ChatCompletionLogProb(
+                token=(token := self._get_decoded_token(
+                    p[1],
+                    p[0],
+                    tokenizer,
+                    return_as_token_id=should_return_as_token_id,
+                )),
+                logprob=max(p[1].logprob, -9999.0),
+                bytes=list(token.encode("utf-8", errors="replace")),
+            ) for i, p in enumerate(logprobs.items())
             if top_logprobs and i < top_logprobs
         ]
 
diff --git a/vllm/entrypoints/openai/serving_responses.py b/vllm/entrypoints/openai/serving_responses.py
index ac2b3dfafec..0096c37280a 100644
--- a/vllm/entrypoints/openai/serving_responses.py
+++ b/vllm/entrypoints/openai/serving_responses.py
@@ -55,6 +55,7 @@ def __init__(
         tool_parser: Optional[str] = None,
         enable_prompt_tokens_details: bool = False,
         enable_force_include_usage: bool = False,
+        enable_log_outputs: bool = False,
     ) -> None:
         super().__init__(
             engine_client=engine_client,
@@ -67,6 +68,7 @@ def __init__(
 
         self.chat_template = chat_template
         self.chat_template_content_format: Final = chat_template_content_format
+        self.enable_log_outputs = enable_log_outputs
 
         self.reasoning_parser: Optional[Callable[[AnyTokenizer],
                                                  ReasoningParser]] = None
@@ -335,6 +337,24 @@ async def responses_full_generator(
             usage=usage,
         )
 
+        # Log complete response if output logging is enabled
+        if self.enable_log_outputs and self.request_logger:
+            output_text = ""
+            if content:
+                output_text = content
+            elif reasoning_content:
+                output_text = f"[reasoning: {reasoning_content}]"
+
+            if output_text:
+                self.request_logger.log_outputs(
+                    request_id=request.request_id,
+                    outputs=output_text,
+                    output_token_ids=final_output.token_ids,
+                    finish_reason=final_output.finish_reason,
+                    is_streaming=False,
+                    delta=False,
+                )
+
         if request.store:
             async with self.response_store_lock:
                 stored_response = self.response_store.get(response.id)