
Commit 462b269

Implement OpenAI Responses API [1/N] (#20504)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
1 parent c18b3b8 commit 462b269
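
For context, a minimal sketch of exercising the new endpoint end to end with the official `openai` client, assuming a locally served model like the one the tests below use (the base URL, API key, and server invocation are placeholders, not part of the commit):

```python
# A minimal sketch, not part of the commit: query the new /v1/responses
# endpoint, assuming a server started roughly as the test fixtures below
# do, e.g.:
#   vllm serve Qwen/Qwen3-0.6B --enforce-eager --reasoning-parser deepseek_r1
import openai

client = openai.OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
response = client.responses.create(input="What is 13 * 24?")
print(response.output[-1].content[0].text)
```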

File tree

12 files changed (+1106, -8 lines)

tests/entrypoints/openai/test_openai_schema.py

Lines changed: 4 additions & 0 deletions

```diff
@@ -95,6 +95,10 @@ def test_openapi_stateless(case: schemathesis.Case):
         case.operation.method.upper(),
         case.operation.path,
     )
+    if case.operation.path.startswith("/v1/responses"):
+        # Skip responses API as it is meant to be stateful.
+        return
+
     timeout = {
         # requires a longer timeout
         ("POST", "/v1/chat/completions"):
```

tests/v1/entrypoints/openai/responses/__init__.py

Whitespace-only changes.

tests/v1/entrypoints/openai/responses/conftest.py

Lines changed: 32 additions & 0 deletions

```python
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
import pytest_asyncio

from tests.utils import RemoteOpenAIServer

# Use a small reasoning model to test the responses API.
MODEL_NAME = "Qwen/Qwen3-0.6B"


@pytest.fixture(scope="module")
def default_server_args():
    return [
        "--max-model-len",
        "8192",
        "--enforce-eager",  # For faster startup.
        "--reasoning-parser",
        "deepseek_r1",
    ]


@pytest.fixture(scope="module")
def server(default_server_args):
    with RemoteOpenAIServer(MODEL_NAME, default_server_args) as remote_server:
        yield remote_server


@pytest_asyncio.fixture
async def client(server):
    async with server.get_async_client() as async_client:
        yield async_client
```
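
Together, these module-scoped fixtures boot one server per test module; a hedged sketch of how a test in this directory consumes them (the test name is illustrative):

```python
# A hypothetical test using the conftest fixtures above: `client` is the
# async OpenAI client wired to the RemoteOpenAIServer instance.
import openai
import pytest


@pytest.mark.asyncio
async def test_smoke(client: openai.AsyncOpenAI):
    response = await client.responses.create(input="Hello!")
    assert response.status == "completed"
```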

tests/v1/entrypoints/openai/responses/test_basic.py

Lines changed: 75 additions & 0 deletions

```python
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import openai  # use the official client for correctness check
import pytest


@pytest.mark.asyncio
async def test_simple_input(client: openai.AsyncOpenAI):
    response = await client.responses.create(input="What is 13 * 24?")
    print(response)

    outputs = response.output
    # Whether the output contains the answer.
    assert outputs[-1].type == "message"
    assert "312" in outputs[-1].content[0].text

    # Whether the output contains the reasoning.
    assert outputs[0].type == "reasoning"
    assert outputs[0].text != ""


@pytest.mark.asyncio
async def test_instructions(client: openai.AsyncOpenAI):
    response = await client.responses.create(
        instructions="Finish the answer with QED.",
        input="What is 13 * 24?",
    )
    print(response)

    output_text = response.output[-1].content[0].text
    assert "312" in output_text
    assert "QED" in output_text


@pytest.mark.asyncio
async def test_chat(client: openai.AsyncOpenAI):
    response = await client.responses.create(input=[
        {
            "role": "system",
            "content": "Finish the answer with QED."
        },
        {
            "role": "user",
            "content": "What is 5 * 3?"
        },
        {
            "role": "assistant",
            "content": "15. QED."
        },
        {
            "role": "user",
            "content": "Multiply the result by 2."
        },
    ], )
    print(response)

    output_text = response.output[-1].content[0].text
    assert "30" in output_text
    assert "QED" in output_text


@pytest.mark.asyncio
async def test_chat_with_input_type(client: openai.AsyncOpenAI):
    response = await client.responses.create(input=[
        {
            "role": "user",
            "content": [{
                "type": "input_text",
                "text": "Hello!"
            }],
        },
    ], )
    print(response)
    assert response.status == "completed"
```
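
The assertions above rely on the ordering of output items: reasoning first, the final message last. A small sketch of walking that structure (attribute names follow the official client used in the tests; the function itself is illustrative):

```python
def print_output_items(response) -> None:
    """Walk a Responses result in the order the tests assert on:
    reasoning item(s) first, the final "message" item last."""
    for item in response.output:
        if item.type == "reasoning":
            print("reasoning:", item.text)
        elif item.type == "message":
            print("answer:", item.content[0].text)
```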

tests/v1/entrypoints/openai/responses/test_stateful.py

Lines changed: 137 additions & 0 deletions

```python
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import asyncio

import openai
import pytest


@pytest.mark.asyncio
async def test_store(client: openai.AsyncOpenAI):
    # By default, store is True.
    response = await client.responses.create(input="Hello!")
    assert response.status == "completed"

    # Retrieve the response.
    response = await client.responses.retrieve(response.id)
    assert response.status == "completed"

    # Test store=False.
    response = await client.responses.create(
        input="Hello!",
        store=False,
    )
    assert response.status == "completed"

    # The response should not be found.
    with pytest.raises(openai.NotFoundError,
                       match="Response with id .* not found."):
        await client.responses.retrieve(response.id)


@pytest.mark.asyncio
async def test_background(client: openai.AsyncOpenAI):
    # NOTE: This query should be easy enough for the model to answer
    # within the 10 seconds.
    response = await client.responses.create(
        input="Hello!",
        background=True,
    )
    assert response.status == "queued"

    max_retries = 10
    for _ in range(max_retries):
        await asyncio.sleep(1)
        response = await client.responses.retrieve(response.id)
        if response.status != "queued":
            break
    print(response)

    assert response.status == "completed"


@pytest.mark.asyncio
async def test_background_error(client: openai.AsyncOpenAI):
    with pytest.raises(
            openai.BadRequestError,
            match="background can only be used when `store` is true"):
        _ = await client.responses.create(
            input="What is 13 * 24?",
            background=True,
            store=False,
        )


@pytest.mark.asyncio
async def test_background_cancel(client: openai.AsyncOpenAI):
    response = await client.responses.create(
        input="Write a long story about a cat.",
        background=True,
    )
    assert response.status == "queued"

    # Cancel the response before it is completed.
    # FIXME: This test can be flaky.
    await asyncio.sleep(0.5)
    response = await client.responses.cancel(response.id)
    assert response.status == "cancelled"

    # Make sure the response status remains unchanged.
    await asyncio.sleep(5)
    response = await client.responses.retrieve(response.id)
    assert response.status == "cancelled"


@pytest.mark.asyncio
async def test_cancel_completed(client: openai.AsyncOpenAI):
    response = await client.responses.create(input="Hello")
    assert response.status == "completed"

    with pytest.raises(openai.BadRequestError,
                       match="Cannot cancel a synchronous response."):
        await client.responses.cancel(response.id)


@pytest.mark.asyncio
async def test_previous_response_id(client: openai.AsyncOpenAI):
    response1 = await client.responses.create(
        instructions="You are tested on your ability to retrieve the correct "
        "information from the previous response.",
        input="Hello, my name is John.")

    response2 = await client.responses.create(
        input="Actually, my name is not John. My real name is Mark.",
        previous_response_id=response1.id,
    )

    response3 = await client.responses.create(
        input="What is my real name again? Answer in one word.",
        previous_response_id=response2.id,
    )
    print(response3)
    assert "Mark" in response3.output[-1].content[0].text
    assert "John" not in response3.output[-1].content[0].text


@pytest.mark.asyncio
async def test_two_responses_with_same_prev_id(client: openai.AsyncOpenAI):
    response1 = await client.responses.create(
        instructions="You are tested on your ability to retrieve the correct "
        "information from the previous response.",
        input="Hello, my name is John.")

    # Both response 2 and 3 use response 1 as the previous response.
    response2 = client.responses.create(
        input="Actually, my name is not John. My name is Mark.",
        previous_response_id=response1.id,
    )
    response3 = client.responses.create(
        input="What is my name again? Answer in one word.",
        previous_response_id=response1.id,
    )

    _ = await response2
    response3_result = await response3
    print(response3_result)
    assert "John" in response3_result.output[-1].content[0].text
    assert "Mark" not in response3_result.output[-1].content[0].text
```

tests/v1/entrypoints/openai/responses/test_structured_output.py

Lines changed: 92 additions & 0 deletions

```python
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import json

import openai
import pytest
from pydantic import BaseModel


@pytest.mark.asyncio
async def test_structured_output(client: openai.AsyncOpenAI):
    response = await client.responses.create(
        input=[
            {
                "role": "system",
                "content": "Extract the event information."
            },
            {
                "role": "user",
                "content":
                "Alice and Bob are going to a science fair on Friday.",
            },
        ],
        text={
            "format": {
                "type": "json_schema",
                "name": "calendar_event",
                "schema": {
                    "type": "object",
                    "properties": {
                        "event_name": {
                            "type": "string"
                        },
                        "date": {
                            "type": "string"
                        },
                        "participants": {
                            "type": "array",
                            "items": {
                                "type": "string"
                            }
                        },
                    },
                    "required": ["event_name", "date", "participants"],
                    "additionalProperties": False,
                },
                "description": "A calendar event.",
                "strict": True,
            }
        },
    )
    print(response)

    # NOTE: The JSON schema is applied to the output text, not reasoning.
    output_text = response.output[-1].content[0].text
    event = json.loads(output_text)

    assert event["event_name"].lower() == "science fair"
    assert event["date"] == "Friday"
    participants = event["participants"]
    assert len(participants) == 2
    assert participants[0] == "Alice"
    assert participants[1] == "Bob"


@pytest.mark.asyncio
async def test_structured_output_with_parse(client: openai.AsyncOpenAI):

    class CalendarEvent(BaseModel):
        event_name: str
        date: str
        participants: list[str]

    response = await client.responses.parse(
        model=None,
        instructions="Extract the event information.",
        input="Alice and Bob are going to a science fair on Friday.",
        text_format=CalendarEvent,
    )
    print(response)

    # The output is successfully parsed.
    event = response.output_parsed
    assert event is not None

    # The output is correct.
    assert event.event_name.lower() == "science fair"
    assert event.date == "Friday"
    participants = event.participants
    assert len(participants) == 2
    assert participants[0] == "Alice"
    assert participants[1] == "Bob"
```
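
The parse() variant in the second test is, in effect, create() plus client-side validation. A sketch of performing that validation by hand with pydantic v2 (the sample output_text below stands in for the text returned by the first test):

```python
from pydantic import BaseModel


class CalendarEvent(BaseModel):
    event_name: str
    date: str
    participants: list[str]


# Stand-in for the JSON text the schema-constrained create() call returns.
output_text = ('{"event_name": "Science Fair", "date": "Friday", '
               '"participants": ["Alice", "Bob"]}')

# Equivalent in effect to response.output_parsed from responses.parse().
event = CalendarEvent.model_validate_json(output_text)
assert event.date == "Friday"
```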

vllm/entrypoints/chat_utils.py

Lines changed: 3 additions & 1 deletion

```diff
@@ -902,6 +902,8 @@ def _get_full_multimodal_text_prompt(placeholder_counts: dict[str, int],
 ] = {
     "text":
     lambda part: _TextParser(part).get("text", None),
+    "input_text":
+    lambda part: _TextParser(part).get("text", None),
     "image_url":
     lambda part: _ImageParser(part).get("image_url", {}).get("url", None),
     "image_embeds":
@@ -1040,7 +1042,7 @@ def _parse_chat_message_content_part(
             "with empty / unparsable content.", part, part_type)
         return None

-    if part_type in ("text", "refusal"):
+    if part_type in ("text", "input_text", "refusal"):
         str_content = cast(str, content)
         if wrap_dicts:
             return {'type': 'text', 'text': str_content}
```
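
This is the change that lets the Responses-style "input_text" content part flow through the existing chat parsing path; a minimal sketch of the payload it now admits (copied from test_chat_with_input_type above):

```python
# After this change, an "input_text" part is parsed exactly like a plain
# "text" part.
part = {"type": "input_text", "text": "Hello!"}
message = {"role": "user", "content": [part]}
```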
