
Commit 34cda77

[Frontend] OpenAI Responses API supports input image (#20975)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
1 parent 30800b0 commit 34cda77
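As a usage sketch (not part of the commit): with a vLLM server running this model, an image can now be passed to the Responses API either by URL or as a base64 data URL. The `base_url` and `api_key` below are placeholder assumptions; the message shape mirrors the tests added in this commit.

```python
import openai

# Placeholder endpoint/key for a locally served vLLM instance (assumptions).
client = openai.OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

response = client.responses.create(
    model="Qwen/Qwen2.5-VL-3B-Instruct",
    input=[{
        "role": "user",
        "content": [
            {
                "type": "input_image",
                # A reachable image URL, or a data:image/...;base64,... URL.
                "image_url": "https://example.com/image.png",
                "detail": "auto",
            },
            {
                "type": "input_text",
                "text": "What's in this image?",
            },
        ],
    }],
)
print(response.output_text)
```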

2 files changed (+172, -3 lines)
Lines changed: 166 additions & 0 deletions (new test file)

```python
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import json

import openai
import pytest
import pytest_asyncio

from tests.utils import RemoteOpenAIServer
from vllm.multimodal.utils import encode_image_base64, fetch_image

# Use a small vision model for testing
MODEL_NAME = "Qwen/Qwen2.5-VL-3B-Instruct"
MAXIMUM_IMAGES = 2
# Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA)
TEST_IMAGE_URLS = [
    "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",
    "https://upload.wikimedia.org/wikipedia/commons/f/fa/Grayscale_8bits_palette_sample_image.png",
    "https://upload.wikimedia.org/wikipedia/commons/thumb/9/91/Venn_diagram_rgb.svg/1280px-Venn_diagram_rgb.svg.png",
    "https://upload.wikimedia.org/wikipedia/commons/0/0b/RGBA_comp.png",
]


@pytest.fixture(scope="module")
def default_image_server_args():
    return [
        "--enforce-eager",
        "--max-model-len",
        "6000",
        "--max-num-seqs",
        "128",
        "--limit-mm-per-prompt",
        json.dumps({"image": MAXIMUM_IMAGES}),
    ]


@pytest.fixture(scope="module")
def image_server(default_image_server_args):
    with RemoteOpenAIServer(MODEL_NAME,
                            default_image_server_args) as remote_server:
        yield remote_server


@pytest_asyncio.fixture
async def client(image_server):
    async with image_server.get_async_client() as async_client:
        yield async_client


@pytest.fixture(scope="session")
def base64_encoded_image() -> dict[str, str]:
    return {
        image_url: encode_image_base64(fetch_image(image_url))
        for image_url in TEST_IMAGE_URLS
    }


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
async def test_single_chat_session_image(client: openai.AsyncOpenAI,
                                         model_name: str, image_url: str):
    content_text = "What's in this image?"
    messages = [{
        "role": "user",
        "content": [
            {
                "type": "input_image",
                "image_url": image_url,
                "detail": "auto",
            },
            {
                "type": "input_text",
                "text": content_text,
            },
        ],
    }]

    # test image passed by URL
    response = await client.responses.create(
        model=model_name,
        input=messages,
    )
    assert len(response.output_text) > 0


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
async def test_single_chat_session_image_base64encoded(
    client: openai.AsyncOpenAI,
    model_name: str,
    image_url: str,
    base64_encoded_image: dict[str, str],
):
    content_text = "What's in this image?"
    messages = [{
        "role": "user",
        "content": [
            {
                "type": "input_image",
                "image_url":
                f"data:image/jpeg;base64,{base64_encoded_image[image_url]}",
                "detail": "auto",
            },
            {
                "type": "input_text",
                "text": content_text,
            },
        ],
    }]

    # test image passed inline as a base64 data URL
    response = await client.responses.create(
        model=model_name,
        input=messages,
    )
    assert len(response.output_text) > 0


@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
# Prefixes of length 2 and 3; with MAXIMUM_IMAGES = 2, the 3-image case
# exercises the rejection path below.
@pytest.mark.parametrize(
    "image_urls",
    [TEST_IMAGE_URLS[:i] for i in range(2, len(TEST_IMAGE_URLS))])
async def test_multi_image_input(client: openai.AsyncOpenAI, model_name: str,
                                 image_urls: list[str]):
    messages = [{
        "role": "user",
        "content": [
            *({
                "type": "input_image",
                "image_url": image_url,
                "detail": "auto",
            } for image_url in image_urls),
            {
                "type": "input_text",
                "text": "What's in this image?",
            },
        ],
    }]

    if len(image_urls) > MAXIMUM_IMAGES:
        # requests exceeding --limit-mm-per-prompt should be rejected
        with pytest.raises(openai.BadRequestError):
            await client.responses.create(
                model=model_name,
                input=messages,
            )

        # the server should still work afterwards
        response = await client.responses.create(
            model=model_name,
            input=[{
                "role": "user",
                "content": "What's the weather like in Paris today?",
            }],
        )
        assert len(response.output_text) > 0
    else:
        response = await client.responses.create(
            model=model_name,
            input=messages,
        )
        assert len(response.output_text) > 0
```

vllm/entrypoints/chat_utils.py

Lines changed: 6 additions & 3 deletions

```diff
@@ -28,6 +28,7 @@
     ChatCompletionToolMessageParam)
 from openai.types.chat.chat_completion_content_part_input_audio_param import (
     InputAudio)
+from openai.types.responses import ResponseInputImageParam
 from PIL import Image
 from pydantic import BaseModel, ConfigDict, TypeAdapter
 # yapf: enable
@@ -942,6 +943,8 @@ def _get_full_multimodal_text_prompt(placeholder_storage: dict[str, list],
 _AudioParser = TypeAdapter(ChatCompletionContentPartAudioParam).validate_python
 _VideoParser = TypeAdapter(ChatCompletionContentPartVideoParam).validate_python
 
+_ResponsesInputImageParser = TypeAdapter(
+    ResponseInputImageParam).validate_python
 _ContentPart: TypeAlias = Union[str, dict[str, str], InputAudio, PILImage]
 
 # Define a mapping from part types to their corresponding parsing functions.
@@ -953,6 +956,8 @@ def _get_full_multimodal_text_prompt(placeholder_storage: dict[str, list],
     lambda part: _TextParser(part).get("text", None),
     "input_text":
     lambda part: _TextParser(part).get("text", None),
+    "input_image":
+    lambda part: _ResponsesInputImageParser(part).get("image_url", None),
     "image_url":
     lambda part: _ImageParser(part).get("image_url", {}).get("url", None),
     "image_embeds":
@@ -1085,10 +1090,8 @@ def _parse_chat_message_content_part(
     """
     if isinstance(part, str):  # Handle plain text parts
         return part
-
     # Handle structured dictionary parts
     part_type, content = _parse_chat_message_content_mm_part(part)
-
     # if part_type is text/refusal/image_url/audio_url/video_url/input_audio but
     # content is None, log a warning and skip
     if part_type in VALID_MESSAGE_CONTENT_MM_PART_TYPES and content is None:
@@ -1109,7 +1112,7 @@
         image_content = cast(Image.Image, content)
         mm_parser.parse_image_pil(image_content)
         modality = "image"
-    elif part_type == "image_url":
+    elif part_type in ("image_url", "input_image"):
         str_content = cast(str, content)
         mm_parser.parse_image(str_content)
         modality = "image"
```
