
Commit 34cda77

[Frontend] OpenAI Responses API supports input image (#20975)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
1 parent 30800b0 commit 34cda77
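As a usage sketch (not part of the commit): with a vLLM server running this model, an image can now be passed to the Responses API either by URL or as a base64 data URL. The `base_url` and `api_key` below are placeholder assumptions; the message shape mirrors the tests added in this commit.

```python
import openai

# Placeholder endpoint/key for a locally served vLLM instance (assumptions).
client = openai.OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

response = client.responses.create(
    model="Qwen/Qwen2.5-VL-3B-Instruct",
    input=[{
        "role": "user",
        "content": [
            {
                "type": "input_image",
                # A reachable image URL, or a data:image/...;base64,... URL.
                "image_url": "https://example.com/image.png",
                "detail": "auto",
            },
            {
                "type": "input_text",
                "text": "What's in this image?",
            },
        ],
    }],
)
print(response.output_text)
```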

2 files changed (+172, -3 lines)
Lines changed: 166 additions & 0 deletions (new test file)

```python
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import json

import openai
import pytest
import pytest_asyncio

from tests.utils import RemoteOpenAIServer
from vllm.multimodal.utils import encode_image_base64, fetch_image

# Use a small vision model for testing
MODEL_NAME = "Qwen/Qwen2.5-VL-3B-Instruct"
MAXIMUM_IMAGES = 2
# Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA)
TEST_IMAGE_URLS = [
    "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",
    "https://upload.wikimedia.org/wikipedia/commons/f/fa/Grayscale_8bits_palette_sample_image.png",
    "https://upload.wikimedia.org/wikipedia/commons/thumb/9/91/Venn_diagram_rgb.svg/1280px-Venn_diagram_rgb.svg.png",
    "https://upload.wikimedia.org/wikipedia/commons/0/0b/RGBA_comp.png",
]


@pytest.fixture(scope="module")
def default_image_server_args():
    return [
        "--enforce-eager",
        "--max-model-len",
        "6000",
        "--max-num-seqs",
        "128",
        "--limit-mm-per-prompt",
        json.dumps({"image": MAXIMUM_IMAGES}),
    ]


@pytest.fixture(scope="module")
def image_server(default_image_server_args):
    with RemoteOpenAIServer(MODEL_NAME,
                            default_image_server_args) as remote_server:
        yield remote_server


@pytest_asyncio.fixture
async def client(image_server):
    async with image_server.get_async_client() as async_client:
        yield async_client


@pytest.fixture(scope="session")
def base64_encoded_image() -> dict[str, str]:
    return {
        image_url: encode_image_base64(fetch_image(image_url))
        for image_url in TEST_IMAGE_URLS
    }


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
async def test_single_chat_session_image(client: openai.AsyncOpenAI,
                                         model_name: str, image_url: str):
    content_text = "What's in this image?"
    messages = [{
        "role": "user",
        "content": [
            {
                "type": "input_image",
                "image_url": image_url,
                "detail": "auto",
            },
            {
                "type": "input_text",
                "text": content_text,
            },
        ],
    }]

    # test image passed by URL
    response = await client.responses.create(
        model=model_name,
        input=messages,
    )
    assert len(response.output_text) > 0


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
async def test_single_chat_session_image_base64encoded(
    client: openai.AsyncOpenAI,
    model_name: str,
    image_url: str,
    base64_encoded_image: dict[str, str],
):
    content_text = "What's in this image?"
    messages = [{
        "role": "user",
        "content": [
            {
                "type": "input_image",
                "image_url":
                f"data:image/jpeg;base64,{base64_encoded_image[image_url]}",
                "detail": "auto",
            },
            {
                "type": "input_text",
                "text": content_text,
            },
        ],
    }]

    # test image passed inline as a base64 data URL
    response = await client.responses.create(
        model=model_name,
        input=messages,
    )
    assert len(response.output_text) > 0


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
# Prefixes of length 2 and 3; with MAXIMUM_IMAGES = 2, the 3-image case
# exercises the rejection path below.
@pytest.mark.parametrize(
    "image_urls",
    [TEST_IMAGE_URLS[:i] for i in range(2, len(TEST_IMAGE_URLS))])
async def test_multi_image_input(client: openai.AsyncOpenAI, model_name: str,
                                 image_urls: list[str]):
    messages = [{
        "role": "user",
        "content": [
            *({
                "type": "input_image",
                "image_url": image_url,
                "detail": "auto",
            } for image_url in image_urls),
            {
                "type": "input_text",
                "text": "What's in this image?",
            },
        ],
    }]

    if len(image_urls) > MAXIMUM_IMAGES:
        # requests exceeding --limit-mm-per-prompt should be rejected
        with pytest.raises(openai.BadRequestError):
            await client.responses.create(
                model=model_name,
                input=messages,
            )

        # the server should still work afterwards
        response = await client.responses.create(
            model=model_name,
            input=[{
                "role": "user",
                "content": "What's the weather like in Paris today?",
            }],
        )
        assert len(response.output_text) > 0
    else:
        response = await client.responses.create(
            model=model_name,
            input=messages,
        )
        assert len(response.output_text) > 0
```

vllm/entrypoints/chat_utils.py

Lines changed: 6 additions & 3 deletions

```diff
@@ -28,6 +28,7 @@
     ChatCompletionToolMessageParam)
 from openai.types.chat.chat_completion_content_part_input_audio_param import (
     InputAudio)
+from openai.types.responses import ResponseInputImageParam
 from PIL import Image
 from pydantic import BaseModel, ConfigDict, TypeAdapter
 # yapf: enable
@@ -942,6 +943,8 @@ def _get_full_multimodal_text_prompt(placeholder_storage: dict[str, list],
 _AudioParser = TypeAdapter(ChatCompletionContentPartAudioParam).validate_python
 _VideoParser = TypeAdapter(ChatCompletionContentPartVideoParam).validate_python
 
+_ResponsesInputImageParser = TypeAdapter(
+    ResponseInputImageParam).validate_python
 _ContentPart: TypeAlias = Union[str, dict[str, str], InputAudio, PILImage]
 
 # Define a mapping from part types to their corresponding parsing functions.
@@ -953,6 +956,8 @@ def _get_full_multimodal_text_prompt(placeholder_storage: dict[str, list],
     lambda part: _TextParser(part).get("text", None),
     "input_text":
     lambda part: _TextParser(part).get("text", None),
+    "input_image":
+    lambda part: _ResponsesInputImageParser(part).get("image_url", None),
     "image_url":
     lambda part: _ImageParser(part).get("image_url", {}).get("url", None),
     "image_embeds":
@@ -1085,10 +1090,8 @@ def _parse_chat_message_content_part(
     """
     if isinstance(part, str):  # Handle plain text parts
         return part
-
     # Handle structured dictionary parts
     part_type, content = _parse_chat_message_content_mm_part(part)
-
     # if part_type is text/refusal/image_url/audio_url/video_url/input_audio but
     # content is None, log a warning and skip
     if part_type in VALID_MESSAGE_CONTENT_MM_PART_TYPES and content is None:
@@ -1109,7 +1112,7 @@
         image_content = cast(Image.Image, content)
         mm_parser.parse_image_pil(image_content)
         modality = "image"
-    elif part_type == "image_url":
+    elif part_type in ("image_url", "input_image"):
         str_content = cast(str, content)
         mm_parser.parse_image(str_content)
         modality = "image"
```
