feat: Add --enable-log-outputs flag for logging model generations #20707
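Adds a log_outputs method to RequestLogger (vllm/entrypoints/logger.py), mirroring the existing log_inputs, together with unit tests covering the non-streaming, streaming-delta, streaming-complete, truncation, None, and empty-output cases. Per the title, output logging is gated behind a new --enable-log-outputs server flag; the flag's wiring is outside the two files shown below, but the intended use would presumably be along the lines of vllm serve <model> --enable-log-outputs (the exact invocation is an assumption, not shown in this diff).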

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open · wants to merge 16 commits into main
253 changes: 249 additions & 4 deletions tests/test_logger.py
@@ -10,11 +10,12 @@
from json.decoder import JSONDecodeError
from tempfile import NamedTemporaryFile
from typing import Any
-from unittest.mock import patch
+from unittest.mock import MagicMock, patch
from uuid import uuid4

import pytest

from vllm.entrypoints.logger import RequestLogger
from vllm.logger import (_DATE_FORMAT, _FORMAT, _configure_vllm_root_logger,
enable_trace_function_call, init_logger)
from vllm.logging_utils import NewLineFormatter
@@ -228,9 +229,10 @@ def test_prepare_object_to_dump():
list_obj = [1, 2, 3]
assert prepare_object_to_dump(list_obj) == '[1, 2, 3]'

-    dict_obj = {'a': 1, 'b': 'b'}
+    dict_obj = {"a": 1, "b": "b"}
     assert prepare_object_to_dump(dict_obj) in [
-        "{a: 1, b: 'b'}", "{b: 'b', a: 1}"
+        "{a: 1, b: 'b'}",
+        "{b: 'b', a: 1}",
]

set_obj = {1, 2, 3}
@@ -252,4 +254,247 @@ class CustomClass:
b: str

     assert (prepare_object_to_dump(CustomClass(
-        1, 'b')) == "CustomClass(a=1, b='b')")
+        1, "b")) == "CustomClass(a=1, b='b')")


def test_request_logger_log_outputs():
"""Test the new log_outputs functionality."""
# Create a mock logger to capture log calls
mock_logger = MagicMock()

with patch("vllm.entrypoints.logger.logger", mock_logger):
request_logger = RequestLogger(max_log_len=None)

# Test basic output logging
request_logger.log_outputs(
request_id="test-123",
outputs="Hello, world!",
output_token_ids=[1, 2, 3, 4],
finish_reason="stop",
is_streaming=False,
delta=False,
)

mock_logger.info.assert_called_once()
call_args = mock_logger.info.call_args.args
assert "Generated response %s%s" in call_args[0]
assert call_args[1] == "test-123"
assert call_args[3] == "Hello, world!"
assert call_args[4] == [1, 2, 3, 4]
assert call_args[5] == "stop"


def test_request_logger_log_outputs_streaming_delta():
"""Test log_outputs with streaming delta mode."""
mock_logger = MagicMock()

with patch("vllm.entrypoints.logger.logger", mock_logger):
request_logger = RequestLogger(max_log_len=None)

# Test streaming delta logging
request_logger.log_outputs(
request_id="test-456",
outputs="Hello",
output_token_ids=[1],
finish_reason=None,
is_streaming=True,
delta=True,
)

mock_logger.info.assert_called_once()
call_args = mock_logger.info.call_args.args
assert "Generated response %s%s" in call_args[0]
assert call_args[1] == "test-456"
assert call_args[2] == " (streaming delta)"
assert call_args[3] == "Hello"
assert call_args[4] == [1]
assert call_args[5] is None


def test_request_logger_log_outputs_streaming_complete():
"""Test log_outputs with streaming complete mode."""
mock_logger = MagicMock()

with patch("vllm.entrypoints.logger.logger", mock_logger):
request_logger = RequestLogger(max_log_len=None)

# Test streaming complete logging
request_logger.log_outputs(
request_id="test-789",
outputs="Complete response",
output_token_ids=[1, 2, 3],
finish_reason="length",
is_streaming=True,
delta=False,
)

mock_logger.info.assert_called_once()
call_args = mock_logger.info.call_args.args
assert "Generated response %s%s" in call_args[0]
assert call_args[1] == "test-789"
assert call_args[2] == " (streaming complete)"
assert call_args[3] == "Complete response"
assert call_args[4] == [1, 2, 3]
assert call_args[5] == "length"


def test_request_logger_log_outputs_with_truncation():
"""Test log_outputs respects max_log_len setting."""
mock_logger = MagicMock()

with patch("vllm.entrypoints.logger.logger", mock_logger):
# Set max_log_len to 10
request_logger = RequestLogger(max_log_len=10)

# Test output truncation
long_output = "This is a very long output that should be truncated"
long_token_ids = list(range(20)) # 20 tokens

request_logger.log_outputs(
request_id="test-truncate",
outputs=long_output,
output_token_ids=long_token_ids,
finish_reason="stop",
is_streaming=False,
delta=False,
)

mock_logger.info.assert_called_once()
call_args = mock_logger.info.call_args

# Check that output was truncated to first 10 characters
logged_output = call_args[0][3]
assert logged_output == "This is a "
assert len(logged_output) == 10

# Check that token IDs were truncated to first 10 tokens
logged_token_ids = call_args[0][4]
assert logged_token_ids == list(range(10))
assert len(logged_token_ids) == 10


def test_request_logger_log_outputs_none_values():
"""Test log_outputs handles None values correctly."""
mock_logger = MagicMock()

with patch("vllm.entrypoints.logger.logger", mock_logger):
request_logger = RequestLogger(max_log_len=None)

# Test with None output_token_ids
request_logger.log_outputs(
request_id="test-none",
outputs="Test output",
output_token_ids=None,
finish_reason="stop",
is_streaming=False,
delta=False,
)

mock_logger.info.assert_called_once()
call_args = mock_logger.info.call_args.args
assert "Generated response %s%s" in call_args[0]
assert call_args[1] == "test-none"
assert call_args[3] == "Test output"
assert call_args[4] is None
assert call_args[5] == "stop"


def test_request_logger_log_outputs_empty_output():
"""Test log_outputs handles empty output correctly."""
mock_logger = MagicMock()

with patch("vllm.entrypoints.logger.logger", mock_logger):
request_logger = RequestLogger(max_log_len=5)

# Test with empty output
request_logger.log_outputs(
request_id="test-empty",
outputs="",
output_token_ids=[],
finish_reason="stop",
is_streaming=False,
delta=False,
)

mock_logger.info.assert_called_once()
call_args = mock_logger.info.call_args.args
assert "Generated response %s%s" in call_args[0]
assert call_args[1] == "test-empty"
assert call_args[3] == ""
assert call_args[4] == []
assert call_args[5] == "stop"


def test_request_logger_log_outputs_integration():
"""Test that log_outputs can be called alongside log_inputs."""
mock_logger = MagicMock()

with patch("vllm.entrypoints.logger.logger", mock_logger):
request_logger = RequestLogger(max_log_len=None)

# Test that both methods can be called without interference
request_logger.log_inputs(
request_id="test-integration",
prompt="Test prompt",
prompt_token_ids=[1, 2, 3],
prompt_embeds=None,
params=None,
lora_request=None,
prompt_adapter_request=None,
)

request_logger.log_outputs(
request_id="test-integration",
outputs="Test output",
output_token_ids=[4, 5, 6],
finish_reason="stop",
is_streaming=False,
delta=False,
)

# Should have been called twice - once for inputs, once for outputs
assert mock_logger.info.call_count == 2

# Check that the calls were made with correct patterns
input_call = mock_logger.info.call_args_list[0][0]
output_call = mock_logger.info.call_args_list[1][0]

assert "Received request %s" in input_call[0]
assert input_call[1] == "test-integration"

assert "Generated response %s%s" in output_call[0]
assert output_call[1] == "test-integration"


def test_streaming_complete_logs_full_text_content():
"""Test that streaming complete logging includes
full accumulated text, not just token count."""
mock_logger = MagicMock()

with patch("vllm.entrypoints.logger.logger", mock_logger):
request_logger = RequestLogger(max_log_len=None)

# Test with actual content instead of token count format
full_response = "This is a complete response from streaming"
request_logger.log_outputs(
request_id="test-streaming-full-text",
outputs=full_response,
output_token_ids=None,
finish_reason="streaming_complete",
is_streaming=True,
delta=False,
)

mock_logger.info.assert_called_once()
call_args = mock_logger.info.call_args.args

# Verify the logged output is the full text, not a token count format
logged_output = call_args[3]
assert logged_output == full_response
assert "tokens>" not in logged_output
assert "streaming_complete" not in logged_output

# Verify other parameters
assert call_args[1] == "test-streaming-full-text"
assert call_args[2] == " (streaming complete)"
assert call_args[5] == "streaming_complete"
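For reference, the non-streaming test above corresponds to a rendered message like the following (a sketch: the timestamp/level prefix depends on vLLM's logging configuration, and only the message body is what the tests assert on):

Generated response test-123: output: 'Hello, world!', output_token_ids: [1, 2, 3, 4], finish_reason: stop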
34 changes: 34 additions & 0 deletions vllm/entrypoints/logger.py
@@ -1,6 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from collections.abc import Sequence
from typing import Optional, Union

import torch
@@ -48,3 +49,36 @@ def log_inputs(
prompt, params, prompt_token_ids,
prompt_embeds.shape if prompt_embeds is not None else None,
lora_request, prompt_adapter_request)

def log_outputs(
self,
request_id: str,
outputs: str,
output_token_ids: Optional[Sequence[int]],
finish_reason: Optional[str] = None,
is_streaming: bool = False,
delta: bool = False,
) -> None:
max_log_len = self.max_log_len
if max_log_len is not None:
if outputs is not None:
outputs = outputs[:max_log_len]

if output_token_ids is not None:
# Convert to list and apply truncation
output_token_ids = list(output_token_ids)[:max_log_len]

stream_info = ""
if is_streaming:
stream_info = (" (streaming delta)"
if delta else " (streaming complete)")

logger.info(
"Generated response %s%s: output: %r, "
"output_token_ids: %s, finish_reason: %s",
request_id,
stream_info,
outputs,
output_token_ids,
finish_reason,
)
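As a usage illustration, here is a minimal sketch of how a serving layer might drive log_outputs for a streamed request. The real call sites in vLLM's OpenAI-compatible server are not part of this diff, so the request id, texts, and token ids below are illustrative assumptions; only the RequestLogger API itself comes from the change above.

from vllm.entrypoints.logger import RequestLogger

request_logger = RequestLogger(max_log_len=None)

# During streaming: log each decoded chunk as a delta.
request_logger.log_outputs(
    request_id="chatcmpl-abc123",  # hypothetical request id
    outputs="Hello",               # text decoded for this chunk
    output_token_ids=[9906],       # illustrative token id
    finish_reason=None,            # stream still in progress
    is_streaming=True,
    delta=True,
)

# After streaming: log the full accumulated text once.
request_logger.log_outputs(
    request_id="chatcmpl-abc123",
    outputs="Hello, world!",
    output_token_ids=[9906, 11, 1917, 0],  # illustrative token ids
    finish_reason="stop",
    is_streaming=True,
    delta=False,
)

The first call is logged with the " (streaming delta)" marker and the second with " (streaming complete)", matching the stream_info logic in log_outputs above.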