@@ -2,14 +2,18 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import asyncio
+import json
 import time
 from collections.abc import AsyncGenerator, AsyncIterator
 from http import HTTPStatus
 from typing import Callable, Final, Optional, Union
 
 import jinja2
 from fastapi import Request
-from openai.types.responses import ResponseOutputMessage, ResponseOutputText
+from openai.types.responses import (ResponseFunctionToolCall,
+                                    ResponseOutputMessage, ResponseOutputText,
+                                    ToolChoiceFunction)
+from pydantic import TypeAdapter
 
 from vllm.config import ModelConfig
 from vllm.engine.protocol import EngineClient
@@ -18,7 +22,8 @@
 from vllm.entrypoints.logger import RequestLogger
 # yapf conflicts with isort for this block
 # yapf: disable
-from vllm.entrypoints.openai.protocol import (ErrorResponse,
+from vllm.entrypoints.openai.protocol import (ErrorResponse, FunctionCall,
+                                              FunctionDefinition,
                                               PromptTokenUsageInfo,
                                               RequestResponseMetadata,
                                               ResponseReasoningItem,
@@ -27,12 +32,13 @@
 # yapf: enable
 from vllm.entrypoints.openai.serving_engine import OpenAIServing
 from vllm.entrypoints.openai.serving_models import OpenAIServingModels
+from vllm.entrypoints.openai.tool_parsers import ToolParser, ToolParserManager
 from vllm.logger import init_logger
 from vllm.outputs import RequestOutput
 from vllm.reasoning import ReasoningParser, ReasoningParserManager
 from vllm.sampling_params import SamplingParams
 from vllm.transformers_utils.tokenizer import AnyTokenizer
-from vllm.utils import random_uuid
+from vllm.utils import random_fc_uuid, random_uuid
 
 
 logger = init_logger(__name__)
@@ -64,7 +70,18 @@ def __init__(
             return_tokens_as_token_ids=return_tokens_as_token_ids,
             enable_force_include_usage=enable_force_include_usage,
         )
-
+        self.enable_auto_tools = enable_auto_tools
+        self.expand_tools_even_if_tool_choice_none = (
+            expand_tools_even_if_tool_choice_none)
+        self.tool_parser: Optional[Callable[[AnyTokenizer], ToolParser]] = None
+        if self.enable_auto_tools:
+            try:
+                self.tool_parser = ToolParserManager.get_tool_parser(
+                    tool_parser)
+            except Exception as e:
+                raise TypeError("Error: --enable-auto-tool-choice requires "
+                                f"tool_parser:'{tool_parser}' which has not "
+                                "been registered") from e
         self.chat_template = chat_template
         self.chat_template_content_format: Final = chat_template_content_format
 
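Note: `__init__` stores the parser *class* and instantiates it per request, mirroring how the chat-completions path resolves tool parsers. A minimal standalone sketch of that lookup, using only the calls the diff itself relies on (the helper name `make_tool_parser` is invented for illustration):

```python
from vllm.entrypoints.openai.tool_parsers import ToolParser, ToolParserManager
from vllm.transformers_utils.tokenizer import AnyTokenizer


def make_tool_parser(name: str, tokenizer: AnyTokenizer) -> ToolParser:
    # get_tool_parser resolves the registered parser class by name and
    # raises for unknown names (hence the TypeError wrapper in __init__).
    parser_cls = ToolParserManager.get_tool_parser(name)
    # The class is kept unconstructed and built per request with the
    # model's tokenizer, as create_responses does below.
    return parser_cls(tokenizer)
```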
@@ -140,11 +157,30 @@ async def create_responses(
         ) = self._maybe_get_adapters(request)
         model_name = self._get_model_name(request.model, lora_request)
         tokenizer = await self.engine_client.get_tokenizer(lora_request)
-
+        if request.tools is None:
+            tool_dicts = None
+        elif (request.tool_choice == "none"
+              and not self.expand_tools_even_if_tool_choice_none):
+            if len(request.tools) > 0:
+                logger.warning_once(
+                    "Tools are specified but tool_choice is set to 'none' "
+                    "and --expand-tools-even-if-tool-choice-none is not "
+                    "enabled. Tool definitions will be excluded from the "
+                    "prompt. This behavior will change in vLLM v0.10 where "
+                    "tool definitions will be included by default even "
+                    "with tool_choice='none'. To adopt the new behavior "
+                    "now, use --expand-tools-even-if-tool-choice-none. "
+                    "To suppress this warning, either remove tools from "
+                    "the request or set tool_choice to a different value.")
+            tool_dicts = None
+        else:
+            tool_dicts = [tool.model_dump() for tool in request.tools]
         _, request_prompts, engine_prompts = await self._preprocess_chat(
             request,
             tokenizer,
             messages,
+            tool_dicts=tool_dicts,
+            tool_parser=self.tool_parser,
             chat_template=self.chat_template,
             chat_template_content_format=self.chat_template_content_format,
         )
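For reference, `tool_dicts` is just the request's tool definitions serialized to plain dicts for the chat template. A hypothetical example of its shape for one function tool, based on the Responses API's flat tool format (exact field set depends on the request model):

```python
# Hypothetical value of `tool_dicts` for a single function tool.
tool_dicts = [{
    "type": "function",
    "name": "get_weather",
    "description": "Look up the current weather for a city.",
    "parameters": {
        "type": "object",
        "properties": {"city": {"type": "string"}},
        "required": ["city"],
    },
}]
```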
@@ -288,28 +324,82 @@ async def responses_full_generator(
             reasoning_content = None
             content = final_output.text
 
-        output = []
-        if reasoning_content:
-            reasoning_item = ResponseReasoningItem(
+        outputs = []
+        output = None
+        if self.tool_parser:
+            function_calls: list[FunctionCall] = []
+            if request.tool_choice and \
+                isinstance(request.tool_choice,
+                           ToolChoiceFunction):
+                # Forced function call.
+                function_calls.append(
+                    FunctionCall(name=request.tool_choice.name,
+                                 arguments=content))
+            elif request.tool_choice is None or request.tool_choice == "none":
+                pass
+            elif request.tool_choice == "required":
+                tool_calls = TypeAdapter(
+                    list[FunctionDefinition]).validate_json(content)
+                function_calls.extend([
+                    FunctionCall(name=tool_call.name,
+                                 arguments=json.dumps(tool_call.parameters,
+                                                      ensure_ascii=False))
+                    for tool_call in tool_calls
+                ])
+            elif request.tool_choice == "auto":
+                try:
+                    tool_parser = self.tool_parser(tokenizer)
+                except RuntimeError as e:
+                    logger.exception("Error in tool parser creation.")
+                    return self.create_error_response(str(e))
+                tool_call_info = tool_parser.extract_tool_calls(
+                    content if content is not None else "", request=request)
+                if tool_call_info is not None and tool_call_info.tools_called:
+                    function_calls.extend(
+                        FunctionCall(
+                            name=tool_call.function.name,
+                            arguments=tool_call.function.arguments,
+                        ) for tool_call in tool_call_info.tool_calls)
+            else:
+                logger.warning(
+                    "Unknown tool choice: %s. "
+                    "Using 'none' as the default tool choice.",
+                    request.tool_choice)
+            output = [
+                ResponseFunctionToolCall(
+                    id=f"fc_{random_fc_uuid()}",
+                    call_id=f"call_{random_uuid()}",
+                    type="function_call",
+                    status="completed",
+                    name=tool_call.name,
+                    arguments=tool_call.arguments,
+                ) for tool_call in function_calls
+            ] or None  # An empty list would mask the fallbacks below.
+        # If no tool call is generated, we still need to return an output.
+        if reasoning_content and output is None:
+            output = ResponseReasoningItem(
                 text=reasoning_content,
                 status=None,  # NOTE: Only the last output item has status.
             )
-            output.append(reasoning_item)
-        if content:
+        # If no tool call is generated, we still need to return an output.
+        if content and output is None:
             output_text = ResponseOutputText(
                 text=content,
                 annotations=[],  # TODO
                 type="output_text",
                 logprobs=None,  # TODO
             )
-            message = ResponseOutputMessage(
+            output = ResponseOutputMessage(
                 id=f"msg_{random_uuid()}",
                 content=[output_text],
                 role="assistant",
                 status="completed",
                 type="message",
             )
-            output.append(message)
+        if isinstance(output, list):
+            outputs.extend(output)
+        else:
+            outputs.append(output)
 
         # Calculate usage.
         assert final_res.prompt_token_ids is not None
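The `tool_choice == "required"` branch above expects the model to emit a bare JSON array of `{name, parameters}` objects, validated against `list[FunctionDefinition]`. A standalone sketch of that round trip, using the same imports the diff adds (the tool name and arguments are made up):

```python
import json

from pydantic import TypeAdapter

from vllm.entrypoints.openai.protocol import FunctionCall, FunctionDefinition

# Example model output under tool_choice="required" (hypothetical tool).
content = '[{"name": "get_weather", "parameters": {"city": "Paris"}}]'

tool_calls = TypeAdapter(list[FunctionDefinition]).validate_json(content)
function_calls = [
    FunctionCall(name=tc.name,
                 arguments=json.dumps(tc.parameters, ensure_ascii=False))
    for tc in tool_calls
]
assert function_calls[0].arguments == '{"city": "Paris"}'
```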
@@ -330,7 +420,7 @@ async def responses_full_generator(
             sampling_params,
             model_name=model_name,
             created_time=created_time,
-            output=output,
+            output=outputs,
             status="completed",
             usage=usage,
         )
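Taken together, the change lets the Responses endpoint surface tool invocations as `function_call` output items instead of plain text. A sketch of exercising it end to end with the OpenAI client; the base URL, model name, and tool are placeholders, and the server is assumed to have been launched with tool calling enabled (e.g. `--enable-auto-tool-choice` plus a registered tool parser):

```python
from openai import OpenAI

# Placeholder endpoint and model; adjust for your deployment.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

response = client.responses.create(
    model="Qwen/Qwen2.5-7B-Instruct",  # example model with tool-call support
    input="What's the weather in Paris?",
    tools=[{
        "type": "function",
        "name": "get_weather",
        "parameters": {
            "type": "object",
            "properties": {"city": {"type": "string"}},
            "required": ["city"],
        },
    }],
    tool_choice="auto",
)

# With this diff, tool calls arrive as ResponseFunctionToolCall items
# (ids prefixed fc_/call_) in response.output.
for item in response.output:
    if item.type == "function_call":
        print(item.name, item.arguments)
```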