diff --git a/examples/online_serving/openai_responses_client_with_tools.py b/examples/online_serving/openai_responses_client_with_tools.py new file mode 100644 index 00000000000..95072c5bddf --- /dev/null +++ b/examples/online_serving/openai_responses_client_with_tools.py @@ -0,0 +1,81 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +Set up this example by starting a vLLM OpenAI-compatible server with tool call +options enabled. +Reasoning models can be used through the Responses API as seen here +https://platform.openai.com/docs/api-reference/responses + +For example: + +vllm serve Qwen/Qwen3-1.7B --reasoning-parser qwen3 \ + --guided-decoding-backend xgrammar \ + --enable-auto-tool-choice --tool-call-parser hermes +""" + +import json + +from openai import OpenAI + + +def get_weather(latitude: float, longitude: float) -> str: + """ + Mock function to simulate getting weather data. + In a real application, this would call an external weather API. + """ + return f"Current temperature at ({latitude}, {longitude}) is 20°C." + + +tools = [ + { + "type": "function", + "name": "get_weather", + "description": "Get current temperature for provided coordinates in celsius.", + "parameters": { + "type": "object", + "properties": { + "latitude": {"type": "number"}, + "longitude": {"type": "number"}, + }, + "required": ["latitude", "longitude"], + "additionalProperties": False, + }, + "strict": True, + } +] + +input_messages = [ + {"role": "user", "content": "What's the weather like in Paris today?"} +] + + +def main(): + base_url = "http://0.0.0.0:8000/v1" + model = "Qwen/Qwen3-1.7B" + client = OpenAI(base_url=base_url, api_key="empty") + response = client.responses.create( + model=model, input=input_messages, tools=tools, tool_choice="required" + ) + tool_call = response.output[0] + args = json.loads(tool_call.arguments) + + result = get_weather(args["latitude"], args["longitude"]) + + input_messages.append(tool_call) # append model's function call message + input_messages.append( + { # append result message + "type": "function_call_output", + "call_id": tool_call.call_id, + "output": str(result), + } + ) + response_2 = client.responses.create( + model=model, + input=input_messages, + tools=tools, + ) + print(response_2.output_text) + + +if __name__ == "__main__": + main() diff --git a/tests/v1/entrypoints/openai/responses/conftest.py b/tests/v1/entrypoints/openai/responses/conftest.py index 2dcdda04ecb..471f016227a 100644 --- a/tests/v1/entrypoints/openai/responses/conftest.py +++ b/tests/v1/entrypoints/openai/responses/conftest.py @@ -15,8 +15,13 @@ def default_server_args(): "--max-model-len", "8192", "--enforce-eager", # For faster startup. 
+ "--enable-auto-tool-choice", + "--guided-decoding-backend", + "xgrammar", + "--tool-call-parser", + "hermes", "--reasoning-parser", - "deepseek_r1", + "qwen3", ] diff --git a/tests/v1/entrypoints/openai/responses/test_function_call.py b/tests/v1/entrypoints/openai/responses/test_function_call.py new file mode 100644 index 00000000000..9cea8c56d48 --- /dev/null +++ b/tests/v1/entrypoints/openai/responses/test_function_call.py @@ -0,0 +1,196 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import json + +import openai # use the official client for correctness check +import pytest + +MODEL_NAME = "Qwen/Qwen3-0.6B" +tools = [ + { + "type": "function", + "name": "get_current_weather", + "description": "Get the current weather in a given location", + "parameters": { + "type": "object", + "properties": { + "city": { + "type": "string", + "description": + "The city to find the weather for, e.g. 'Vienna'", + "default": "Vienna", + }, + "country": { + "type": + "string", + "description": + "The country that the city is in, e.g. 'Austria'", + }, + "unit": { + "type": "string", + "description": "The unit to fetch the temperature in", + "enum": ["celsius", "fahrenheit"], + }, + "options": { + "$ref": "#/$defs/WeatherOptions", + "description": "Optional parameters for weather query", + }, + }, + "required": ["country", "unit"], + "$defs": { + "WeatherOptions": { + "title": "WeatherOptions", + "type": "object", + "additionalProperties": False, + "properties": { + "unit": { + "type": "string", + "enum": ["celsius", "fahrenheit"], + "default": "celsius", + "description": "Temperature unit", + "title": "Temperature Unit", + }, + "include_forecast": { + "type": "boolean", + "default": False, + "description": + "Whether to include a 24-hour forecast", + "title": "Include Forecast", + }, + "language": { + "type": "string", + "default": "zh-CN", + "description": "Language of the response", + "title": "Language", + "enum": ["zh-CN", "en-US", "ja-JP"], + }, + }, + }, + }, + }, + }, + { + "type": "function", + "name": "get_forecast", + "description": "Get the weather forecast for a given location", + "parameters": { + "type": "object", + "properties": { + "city": { + "type": "string", + "description": + "The city to get the forecast for, e.g. 'Vienna'", + "default": "Vienna", + }, + "country": { + "type": + "string", + "description": + "The country that the city is in, e.g. 
'Austria'", + }, + "days": { + "type": "integer", + "description": + "Number of days to get the forecast for (1-7)", + }, + "unit": { + "type": "string", + "description": "The unit to fetch the temperature in", + "enum": ["celsius", "fahrenheit"], + }, + }, + "required": ["country", "days", "unit"], + }, + }, +] + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +@pytest.mark.parametrize("tool_choice", ["auto", "required"]) +async def test_function_tool_use(client: openai.AsyncOpenAI, model_name: str, + tool_choice: str): + prompt = [{ + "role": "user", + "content": + "Can you tell me what the current weather is in Berlin and the "\ + "forecast for the next 5 days, in fahrenheit?", + },] + response = await client.responses.create( + model=model_name, + input=prompt, + tools=tools, + tool_choice=tool_choice, + ) + + assert len(response.output) >= 1 + tool_call = response.output[0] + + assert tool_call.type == "function_call" + assert json.loads(tool_call.arguments) is not None + + +@pytest.mark.asyncio +async def test_named_tool_use(client: openai.AsyncOpenAI): + + def get_weather(latitude: float, longitude: float) -> str: + """ + Mock function to simulate getting weather data. + In a real application, this would call an external weather API. + """ + return f"Current temperature at ({latitude}, {longitude}) is 20°C." + + tools = [{ + "type": "function", + "name": "get_weather", + "description": + "Get current temperature for provided coordinates in celsius.", + "parameters": { + "type": "object", + "properties": { + "latitude": { + "type": "number" + }, + "longitude": { + "type": "number" + } + }, + "required": ["latitude", "longitude"], + "additionalProperties": False + }, + "strict": True + }] + + input_messages = [{ + "role": "user", + "content": "What's the weather like in Paris today?" 
+ }] + + response = await client.responses.create(model=MODEL_NAME, + input=input_messages, + tools=tools, + tool_choice={ + "type": "function", + "name": "get_weather" + }) + assert len(response.output) == 1 + tool_call = response.output[0] + assert tool_call.type == "function_call" + assert tool_call.name == "get_weather" + args = json.loads(tool_call.arguments) + assert args["latitude"] is not None + assert args["longitude"] is not None + # call the tool + result = get_weather(args["latitude"], args["longitude"]) + input_messages.append(tool_call) # append model's function call message + input_messages.append({ # append result message + "type": "function_call_output", + "call_id": tool_call.call_id, + "output": str(result) + }) + # create a new response with the tool call result + response_2 = await client.responses.create(model=MODEL_NAME, + input=input_messages) + # check the output + assert len(response_2.output_text) > 0 diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index f17faa23d01..f5bea4ec897 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -17,9 +17,11 @@ from openai.types.chat.chat_completion_message import ( Annotation as OpenAIAnnotation) # yapf: enable -from openai.types.responses import (ResponseInputParam, ResponseOutputItem, +from openai.types.responses import (ResponseFunctionToolCall, + ResponseInputParam, ResponseOutputItem, ResponseOutputMessage, ResponsePrompt, - ResponseStatus, ResponseTextConfig) + ResponseStatus, ResponseTextConfig, + ToolChoiceFunction) from openai.types.responses.response import ToolChoice from openai.types.responses.tool import Tool from openai.types.shared import Metadata, Reasoning @@ -324,16 +326,7 @@ def to_sampling_params( top_p = default_sampling_params.get( "top_p", self._DEFAULT_SAMPLING_PARAMS["top_p"]) - # Structured output - guided_decoding = None - if self.text is not None and self.text.format is not None: - response_format = self.text.format - if response_format.type == "json_schema": - guided_decoding = GuidedDecodingParams.from_optional( - json=response_format.schema_) - elif response_format.type == "json_object": - raise NotImplementedError("json_object is not supported") - + guided_decoding = self._get_guided_decoding() # TODO: add more parameters return SamplingParams.from_optional( temperature=temperature, @@ -373,6 +366,91 @@ def check_cache_salt_support(cls, data): "non-empty string if provided.") return data + def _get_guided_json_from_tool( + self) -> Optional[Union[str, dict, BaseModel]]: + # user has chosen to use a named tool + if isinstance(self.tool_choice, ToolChoiceFunction): + tool_name = self.tool_choice.name + tools = {tool.name: tool for tool in \ + self.tools if tool.type == "function"} + if tool_name not in tools: + raise ValueError( + f"Tool '{tool_name}' has not been passed in `tools`.") + tool = tools[tool_name] + return tool.parameters + + if self.tool_choice == "required": + # Pydantic schema generation cannot be used since the JSON schema + # has to be constructed for a specific instantiation of a tool list + # so that parameters of a function are correctly generated + # based on the chosen function name + def get_tool_schema(tool: ToolChoiceFunction) -> dict: + return { + "properties": { + "name": { + "type": "string", + "enum": [tool.name] + }, + # parameters are always generated as '{}' in the final + # output if they are missing from the request + # (i.e. 
are None or '{}') so the schema is + # updated to produce an empty object in that case + "parameters": tool.parameters if tool.parameters else { + "type": "object", + "properties": {} + } + }, + "required": ["name", "parameters"] + } + + def get_tool_schema_defs(tools: list[ToolChoiceFunction]) -> dict: + all_defs = dict[str, dict[str, Any]]() + for tool in tools: + if tool.parameters is None: + continue + defs = tool.parameters.pop("$defs", {}) + for def_name, def_schema in defs.items(): + if def_name in all_defs and all_defs[ + def_name] != def_schema: + raise ValueError( + f"Tool definition '{def_name}' has " + "multiple schemas, which is not " + "supported.") + else: + all_defs[def_name] = def_schema + return all_defs + + json_schema = { + "type": "array", + "minItems": 1, + "items": { + "type": "object", + "anyOf": [get_tool_schema(tool) for tool in self.tools] + } + } + json_schema_defs = get_tool_schema_defs(self.tools) + if json_schema_defs: + json_schema["$defs"] = json_schema_defs + return json_schema + + return None + + def _get_guided_decoding(self) -> Optional[GuidedDecodingParams]: + # Structured output + guided_decoding = None + if self.text is not None and self.text.format is not None: + response_format = self.text.format + if response_format.type == "json_schema": + guided_decoding = GuidedDecodingParams.from_optional( + json=response_format.schema_) + elif response_format.type == "json_object": + raise NotImplementedError("json_object is not supported") + # Function call + elif not (self.tool_choice == "none" or self.tools is None): + guided_decoding = GuidedDecodingParams.from_optional( + json=self._get_guided_json_from_tool()) + return guided_decoding + class ChatCompletionRequest(OpenAIBaseModel): # Ordered by official OpenAI API documentation @@ -1719,7 +1797,8 @@ class ResponsesResponse(OpenAIBaseModel): metadata: Optional[Metadata] = None model: str object: Literal["response"] = "response" - output: list[Union[ResponseOutputMessage, ResponseReasoningItem]] + output: list[Union[ResponseOutputMessage, ResponseReasoningItem, + ResponseFunctionToolCall]] parallel_tool_calls: bool temperature: float tool_choice: ToolChoice diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index b902166a25b..70a6b3b8982 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -6,7 +6,7 @@ import time from collections.abc import AsyncGenerator, AsyncIterator from collections.abc import Sequence as GenericSequence -from typing import Callable, Final, Optional, Union +from typing import Final, Optional, Union import jinja2 import partial_json_parser @@ -31,13 +31,12 @@ from vllm.entrypoints.openai.serving_engine import (OpenAIServing, clamp_prompt_logprobs) from vllm.entrypoints.openai.serving_models import OpenAIServingModels -from vllm.entrypoints.openai.tool_parsers import ToolParser, ToolParserManager +from vllm.entrypoints.openai.tool_parsers import ToolParser from vllm.entrypoints.openai.tool_parsers.mistral_tool_parser import ( MistralToolCall) from vllm.entrypoints.utils import get_max_tokens from vllm.logger import init_logger from vllm.outputs import CompletionOutput, RequestOutput -from vllm.reasoning import ReasoningParser, ReasoningParserManager from vllm.sampling_params import BeamSearchParams, SamplingParams from vllm.sequence import Logprob from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer @@ -78,39 +77,13 @@ def __init__( self.chat_template = chat_template 
self.chat_template_content_format: Final = chat_template_content_format - # set up tool use self.enable_auto_tools: bool = enable_auto_tools - if self.enable_auto_tools: - logger.info( - "\"auto\" tool choice has been enabled please note that while" - " the parallel_tool_calls client option is preset for " - "compatibility reasons, it will be ignored.") - - self.reasoning_parser: Optional[Callable[[AnyTokenizer], - ReasoningParser]] = None - if reasoning_parser: - try: - self.reasoning_parser = ( - ReasoningParserManager.get_reasoning_parser( - reasoning_parser)) - assert self.reasoning_parser is not None - except Exception as e: - raise TypeError( - f"{reasoning_parser=} has not been registered") from e - self.tool_parser: Optional[Callable[[AnyTokenizer], ToolParser]] = None - if self.enable_auto_tools: - try: - if (tool_parser == "pythonic" and - model_config.model.startswith("meta-llama/Llama-3.2")): - logger.warning( - "Llama3.2 models may struggle to emit valid pythonic" - " tool calls") - self.tool_parser = ToolParserManager.get_tool_parser( - tool_parser) - except Exception as e: - raise TypeError("Error: --enable-auto-tool-choice requires " - f"tool_parser:'{tool_parser}' which has not " - "been registered") from e + # set up reasoning parser + self.reasoning_parser = self._get_reasoning_parser( + reasoning_parser_name=reasoning_parser) + # set up tool use + self.tool_parser = self._get_tool_parser( + tool_parser_name=tool_parser, enable_auto_tools=enable_auto_tools) self.enable_prompt_tokens_details = enable_prompt_tokens_details self.enable_force_include_usage = enable_force_include_usage diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index 462317a0878..12b8dd4f8a8 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -57,7 +57,7 @@ TranscriptionResponse, TranslationRequest) from vllm.entrypoints.openai.serving_models import OpenAIServingModels -from vllm.entrypoints.openai.tool_parsers import ToolParser +from vllm.entrypoints.openai.tool_parsers import ToolParser, ToolParserManager # yapf: enable from vllm.inputs.data import EmbedsPrompt as EngineEmbedsPrompt from vllm.inputs.data import TokensPrompt as EngineTokensPrompt @@ -69,6 +69,7 @@ from vllm.outputs import PoolingRequestOutput, RequestOutput from vllm.pooling_params import PoolingParams from vllm.prompt_adapter.request import PromptAdapterRequest +from vllm.reasoning import ReasoningParser, ReasoningParserManager from vllm.sampling_params import BeamSearchParams, SamplingParams from vllm.sequence import Logprob, PromptLogprobs from vllm.tracing import (contains_trace_headers, extract_trace_headers, @@ -898,7 +899,6 @@ async def _preprocess_chat( model_config=model_config, **_chat_template_kwargs, ) - mm_data = await mm_data_future # tool parsing is done only if a tool_parser has been set and if @@ -908,8 +908,11 @@ async def _preprocess_chat( request, "tool_choice") and request.tool_choice != "none") if should_parse_tools: - if not isinstance(request, ChatCompletionRequest): - msg = "Tool usage is only supported for Chat Completions API" + if not isinstance(request, + ChatCompletionRequest) and not isinstance( + request, ResponsesRequest): + msg = "Tool usage is only supported for Chat Completions API " \ + "and Responses API requests." 
raise NotImplementedError(msg) request = tool_parser(tokenizer).adjust_request( # type: ignore @@ -1054,6 +1057,51 @@ def _get_model_name(self, return self.models.base_model_paths[0].name return model_name + def _get_tool_parser( + self, + tool_parser_name: Optional[str] = None, + enable_auto_tools: bool = False + ) -> Optional[Callable[[AnyTokenizer], ToolParser]]: + """Get the tool parser based on the name.""" + parser = None + if not enable_auto_tools or tool_parser_name is None: + return parser + logger.info( + "\"auto\" tool choice has been enabled please note that while" + " the parallel_tool_calls client option is preset for " + "compatibility reasons, it will be ignored.") + """Get the tool parser based on the name.""" + try: + if (tool_parser_name == "pythonic" + and self.model_config.model.startswith( + "meta-llama/Llama-3.2")): + logger.warning( + "Llama3.2 models may struggle to emit valid pythonic" + " tool calls") + parser = ToolParserManager.get_tool_parser(tool_parser_name) + except Exception as e: + raise TypeError("Error: --enable-auto-tool-choice requires " + f"tool_parser:'{tool_parser_name}' which has not " + "been registered") from e + return parser + + def _get_reasoning_parser( + self, + reasoning_parser_name: str, + ) -> Optional[Callable[[AnyTokenizer], ReasoningParser]]: + """Get the reasoning parser based on the name.""" + parser = None + if not reasoning_parser_name: + return None + try: + parser = (ReasoningParserManager.get_reasoning_parser( + reasoning_parser_name)) + assert parser is not None + except Exception as e: + raise TypeError( + f"{reasoning_parser_name=} has not been registered") from e + return parser + def clamp_prompt_logprobs( prompt_logprobs: Union[PromptLogprobs, diff --git a/vllm/entrypoints/openai/serving_responses.py b/vllm/entrypoints/openai/serving_responses.py index f7bde6e243b..ec2ab9db0ca 100644 --- a/vllm/entrypoints/openai/serving_responses.py +++ b/vllm/entrypoints/openai/serving_responses.py @@ -2,14 +2,23 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import asyncio +import json import time from collections.abc import AsyncGenerator, AsyncIterator from http import HTTPStatus -from typing import Callable, Final, Optional, Union +from typing import Final, Optional, Union import jinja2 from fastapi import Request -from openai.types.responses import ResponseOutputMessage, ResponseOutputText +from openai.types.chat import (ChatCompletionAssistantMessageParam, + ChatCompletionMessageToolCallParam, + ChatCompletionToolMessageParam) +from openai.types.chat.chat_completion_message_tool_call_param import ( + Function as FunctionCallTool) +from openai.types.responses import (ResponseFunctionToolCall, + ResponseOutputMessage, ResponseOutputText, + ToolChoiceFunction) +from pydantic import TypeAdapter from vllm.config import ModelConfig from vllm.engine.protocol import EngineClient @@ -18,7 +27,8 @@ from vllm.entrypoints.logger import RequestLogger # yapf conflicts with isort for this block # yapf: disable -from vllm.entrypoints.openai.protocol import (ErrorResponse, +from vllm.entrypoints.openai.protocol import (ErrorResponse, FunctionCall, + FunctionDefinition, PromptTokenUsageInfo, RequestResponseMetadata, ResponseReasoningItem, @@ -29,10 +39,9 @@ from vllm.entrypoints.openai.serving_models import OpenAIServingModels from vllm.logger import init_logger from vllm.outputs import RequestOutput -from vllm.reasoning import ReasoningParser, ReasoningParserManager from vllm.sampling_params import SamplingParams from 
vllm.transformers_utils.tokenizer import AnyTokenizer -from vllm.utils import random_uuid +from vllm.utils import random_fc_uuid, random_uuid logger = init_logger(__name__) @@ -63,22 +72,15 @@ def __init__( return_tokens_as_token_ids=return_tokens_as_token_ids, enable_force_include_usage=enable_force_include_usage, ) - + self.enable_auto_tools = enable_auto_tools self.chat_template = chat_template self.chat_template_content_format: Final = chat_template_content_format - self.reasoning_parser: Optional[Callable[[AnyTokenizer], - ReasoningParser]] = None - if reasoning_parser: - try: - self.reasoning_parser = ( - ReasoningParserManager.get_reasoning_parser( - reasoning_parser)) - assert self.reasoning_parser is not None - except Exception as e: - raise TypeError( - f"{reasoning_parser=} has not been registered") from e + self.reasoning_parser = self._get_reasoning_parser( + reasoning_parser_name=reasoning_parser) + self.tool_parser = self._get_tool_parser( + tool_parser_name=tool_parser, enable_auto_tools=enable_auto_tools) self.enable_prompt_tokens_details = enable_prompt_tokens_details self.enable_force_include_usage = enable_force_include_usage self.default_sampling_params = ( @@ -139,11 +141,16 @@ async def create_responses( ) = self._maybe_get_adapters(request) model_name = self._get_model_name(request.model, lora_request) tokenizer = await self.engine_client.get_tokenizer(lora_request) - + if request.tools is None: + tool_dicts = None + else: + tool_dicts = [tool.model_dump() for tool in request.tools] _, request_prompts, engine_prompts = await self._preprocess_chat( request, tokenizer, messages, + tool_dicts=tool_dicts, + tool_parser=self.tool_parser, chat_template=self.chat_template, chat_template_content_format=self.chat_template_content_format, ) @@ -272,7 +279,6 @@ async def responses_full_generator( assert final_res is not None assert len(final_res.outputs) == 1 final_output = final_res.outputs[0] - if self.reasoning_parser: try: reasoning_parser = self.reasoning_parser(tokenizer) @@ -286,14 +292,13 @@ async def responses_full_generator( else: reasoning_content = None content = final_output.text - - output = [] + reasoning_item = None + message_item = None if reasoning_content: reasoning_item = ResponseReasoningItem( text=reasoning_content, status=None, # NOTE: Only the last output item has status. ) - output.append(reasoning_item) if content: output_text = ResponseOutputText( text=content, @@ -301,14 +306,79 @@ async def responses_full_generator( type="output_text", logprobs=None, # TODO ) - message = ResponseOutputMessage( + message_item = ResponseOutputMessage( id=f"msg_{random_uuid()}", content=[output_text], role="assistant", status="completed", type="message", ) - output.append(message) + outputs = [] + function_calls: list[FunctionCall] = [] + if not self.enable_auto_tools or not self.tool_parser: + # Tools are not enabled + if reasoning_item: + outputs.append(reasoning_item) + if message_item: + outputs.append(message_item) + elif request.tool_choice is None: + # No tool calls. 
+ if reasoning_item: + outputs.append(reasoning_item) + if message_item: + outputs.append(message_item) + elif request.tool_choice and \ + isinstance(request.tool_choice, + ToolChoiceFunction): + # Forced Function Call + function_calls.append( + FunctionCall(name=request.tool_choice.name, arguments=content)) + elif request.tool_choice == "required": + assert content is not None + tool_calls = TypeAdapter( + list[FunctionDefinition]).validate_json(content) + function_calls.extend([ + FunctionCall(name=tool_call.name, + arguments=json.dumps(tool_call.parameters, + ensure_ascii=False)) + for tool_call in tool_calls + ]) + elif request.tool_choice == "auto" or request.tool_choice == "none": + try: + tool_parser = self.tool_parser(tokenizer) + except RuntimeError as e: + logger.exception("Error in tool parser creation.") + return self.create_error_response(str(e)) + tool_call_info = tool_parser.extract_tool_calls( + content if content is not None else "", request=request) + if tool_call_info is not None and tool_call_info.tools_called: + # extract_tool_calls() returns a list of tool calls. + function_calls.extend( + FunctionCall( + name=tool_call.function.name, + arguments=tool_call.function.arguments, + ) for tool_call in tool_call_info.tool_calls) + else: + # No tool calls. + if reasoning_item: + outputs.append(reasoning_item) + if message_item: + outputs.append(message_item) + else: + return self.create_error_response( + f"Invalid tool_choice: {request.tool_choice}") + + if function_calls: + outputs.extend([ + ResponseFunctionToolCall( + id=f"fc_{random_fc_uuid()}", + call_id=f"call_{random_uuid()}", + type="function_call", + status="completed", + name=tool_call.name, + arguments=tool_call.arguments, + ) for tool_call in function_calls + ]) # Calculate usage. assert final_res.prompt_token_ids is not None @@ -329,7 +399,7 @@ async def responses_full_generator( sampling_params, model_name=model_name, created_time=created_time, - output=output, + output=outputs, status="completed", usage=usage, ) @@ -376,7 +446,33 @@ def _construct_input_messages( if isinstance(request.input, str): messages.append({"role": "user", "content": request.input}) else: - messages.extend(request.input) # type: ignore + for item in request.input: + if item.get("type") == "function_call": + # Append the function call as a tool call. + messages.append( + ChatCompletionAssistantMessageParam( + role="assistant", + tool_calls=[ + ChatCompletionMessageToolCallParam( + id=item.get("call_id"), + function=FunctionCallTool( + name=item.get("name"), + arguments=item.get("arguments", "{}"), + ), + type="function", + ) + ], + )) + elif item.get("type") == "function_call_output": + # Append the function call output as a tool message. 
+ messages.append( + ChatCompletionToolMessageParam( + role="tool", + content=item.get("output", ""), + tool_call_id=item.get("call_id"), + )) + else: + messages.append(item) # type: ignore return messages async def _run_background_request( diff --git a/vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py index 02aeab61363..a0744152102 100644 --- a/vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py @@ -8,7 +8,8 @@ from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, DeltaMessage, - ExtractedToolCallInformation) + ExtractedToolCallInformation, + ResponsesRequest) from vllm.logger import init_logger from vllm.transformers_utils.tokenizer import AnyTokenizer from vllm.utils import import_from_path, is_list_of @@ -39,15 +40,17 @@ def vocab(self) -> dict[str, int]: return self.model_tokenizer.get_vocab() def adjust_request( - self, request: ChatCompletionRequest) -> ChatCompletionRequest: + self, request: Union[ChatCompletionRequest, ResponsesRequest] + ) -> Union[ChatCompletionRequest, ResponsesRequest]: """ Static method that used to adjust the request parameters. """ return request def extract_tool_calls( - self, model_output: str, - request: ChatCompletionRequest) -> ExtractedToolCallInformation: + self, model_output: str, request: Union[ChatCompletionRequest, + ResponsesRequest] + ) -> ExtractedToolCallInformation: """ Static method that should be implemented for extracting tool calls from a complete model-generated string. diff --git a/vllm/entrypoints/openai/tool_parsers/deepseekv3_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/deepseekv3_tool_parser.py index da4760ad1b6..8c1850ef407 100644 --- a/vllm/entrypoints/openai/tool_parsers/deepseekv3_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/deepseekv3_tool_parser.py @@ -7,11 +7,15 @@ import regex as re from vllm.entrypoints.chat_utils import random_tool_call_id +# yapf conflicts with isort for this block +# yapf: disable from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, DeltaFunctionCall, DeltaMessage, DeltaToolCall, ExtractedToolCallInformation, - FunctionCall, ToolCall) + FunctionCall, ResponsesRequest, + ToolCall) +# yapf: enable from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import ( ToolParser, ToolParserManager) from vllm.logger import init_logger @@ -71,7 +75,7 @@ def __init__(self, tokenizer: AnyTokenizer): def extract_tool_calls( self, model_output: str, - request: ChatCompletionRequest, + request: Union[ChatCompletionRequest, ResponsesRequest], ) -> ExtractedToolCallInformation: # sanity check; avoid unnecessary processing diff --git a/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py index 5508ba6a394..85dc3940223 100644 --- a/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py @@ -11,11 +11,15 @@ from partial_json_parser.core.options import Allow from vllm.entrypoints.chat_utils import random_tool_call_id +# yapf conflicts with isort for this block +# yapf: disable from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, DeltaFunctionCall, DeltaMessage, DeltaToolCall, ExtractedToolCallInformation, - FunctionCall, ToolCall) + FunctionCall, ResponsesRequest, + ToolCall) +# yapf: enable from 
vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import ( ToolParser, ToolParserManager) from vllm.entrypoints.openai.tool_parsers.utils import (consume_space, @@ -47,8 +51,9 @@ def __init__(self, tokenizer: AnyTokenizer): self.tool_call_regex = re.compile(r"\s*") def extract_tool_calls( - self, model_output: str, - request: ChatCompletionRequest) -> ExtractedToolCallInformation: + self, model_output: str, request: Union[ChatCompletionRequest, + ResponsesRequest] + ) -> ExtractedToolCallInformation: if self.tool_start_token not in model_output: return ExtractedToolCallInformation(tools_called=False, tool_calls=[], diff --git a/vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py index fcc5b7edda8..887401031c5 100644 --- a/vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py @@ -9,11 +9,15 @@ from partial_json_parser.core.options import Allow from vllm.entrypoints.chat_utils import random_tool_call_id +# yapf conflicts with isort for this block +# yapf: disable from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, DeltaFunctionCall, DeltaMessage, DeltaToolCall, ExtractedToolCallInformation, - FunctionCall, ToolCall) + FunctionCall, ResponsesRequest, + ToolCall) +# yapf: enable from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import ( ToolParser, ToolParserManager) from vllm.entrypoints.openai.tool_parsers.utils import (consume_space, @@ -45,8 +49,9 @@ def __init__(self, tokenizer: AnyTokenizer): self.bot_string = "" def extract_tool_calls( - self, model_output: str, - request: ChatCompletionRequest) -> ExtractedToolCallInformation: + self, model_output: str, request: Union[ChatCompletionRequest, + ResponsesRequest] + ) -> ExtractedToolCallInformation: stripped = model_output.strip()\ .removeprefix(self.bot_token)\ .removeprefix(self.bot_string)\ diff --git a/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py index c7030d34d45..b3e23fa89fd 100644 --- a/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py @@ -10,11 +10,15 @@ from partial_json_parser.core.options import Allow from vllm.entrypoints.chat_utils import random_tool_call_id +# yapf conflicts with isort for this block +# yapf: disable from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, DeltaFunctionCall, DeltaMessage, DeltaToolCall, ExtractedToolCallInformation, - FunctionCall, ToolCall) + FunctionCall, ResponsesRequest, + ToolCall) +# yapf: enable from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import ( ToolParser, ToolParserManager) from vllm.logger import init_logger @@ -64,7 +68,7 @@ def __init__(self, tokenizer: AnyTokenizer): def extract_tool_calls( self, model_output: str, - request: ChatCompletionRequest, + request: Union[ChatCompletionRequest, ResponsesRequest], ) -> ExtractedToolCallInformation: # sanity check; avoid unnecessary processing diff --git a/vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py index 92004de030d..701c7c78bf8 100644 --- a/vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py @@ -9,11 +9,15 @@ from partial_json_parser.core.options import Allow from vllm.entrypoints.chat_utils import random_tool_call_id +# 
yapf conflicts with isort for this block +# yapf: disable from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, DeltaFunctionCall, DeltaMessage, DeltaToolCall, ExtractedToolCallInformation, - FunctionCall, ToolCall) + FunctionCall, ResponsesRequest, + ToolCall) +# yapf: enable from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import ( ToolParser, ToolParserManager) from vllm.entrypoints.openai.tool_parsers.utils import ( @@ -32,7 +36,10 @@ def __init__(self, tokenizer: AnyTokenizer): self.position = 0 def adjust_request( - self, request: ChatCompletionRequest) -> ChatCompletionRequest: + self, request: Union[ChatCompletionRequest, ResponsesRequest] + ) -> Union[ChatCompletionRequest, ResponsesRequest]: + if not isinstance(request, ChatCompletionRequest): + return request if request.tools and request.tool_choice != 'none': # do not skip special tokens because internlm use the special # tokens to indicated the start and end of the tool calls @@ -183,7 +190,7 @@ def extract_tool_calls_streaming( def extract_tool_calls( self, model_output: str, - request: ChatCompletionRequest, + request: Union[ChatCompletionRequest, ResponsesRequest], ) -> ExtractedToolCallInformation: text = model_output tools = request.tools diff --git a/vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py index 66b483d8b0f..9fd67445b79 100644 --- a/vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py @@ -10,11 +10,15 @@ from partial_json_parser.core.options import Allow from vllm.entrypoints.chat_utils import random_tool_call_id +# yapf conflicts with isort for this block +# yapf: disable from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, DeltaFunctionCall, DeltaMessage, DeltaToolCall, ExtractedToolCallInformation, - FunctionCall, ToolCall) + FunctionCall, ResponsesRequest, + ToolCall) +# yapf: enable from vllm.entrypoints.openai.tool_parsers import ToolParser, ToolParserManager from vllm.entrypoints.openai.tool_parsers.utils import ( extract_intermediate_diff) @@ -64,7 +68,10 @@ def __init__(self, tokenizer: AnyTokenizer): "tokens in the tokenizer!") def adjust_request( - self, request: ChatCompletionRequest) -> ChatCompletionRequest: + self, request: Union[ChatCompletionRequest, ResponsesRequest] + ) -> Union[ChatCompletionRequest, ResponsesRequest]: + if not isinstance(request, ChatCompletionRequest): + return request if request.tools and request.tool_choice != 'none': # do not skip special tokens because jamba use the special # tokens to indicate the start and end of the tool calls @@ -73,8 +80,9 @@ def adjust_request( return request def extract_tool_calls( - self, model_output: str, - request: ChatCompletionRequest) -> ExtractedToolCallInformation: + self, model_output: str, request: Union[ChatCompletionRequest, + ResponsesRequest] + ) -> ExtractedToolCallInformation: # sanity check; avoid unnecessary processing if self.tool_calls_start_token not in model_output: diff --git a/vllm/entrypoints/openai/tool_parsers/kimi_k2_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/kimi_k2_tool_parser.py index b0df442dd86..05bf42e7013 100644 --- a/vllm/entrypoints/openai/tool_parsers/kimi_k2_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/kimi_k2_tool_parser.py @@ -7,11 +7,15 @@ import regex as re +# yapf conflicts with isort for this block +# yapf: disable from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, 
DeltaFunctionCall, DeltaMessage, DeltaToolCall, ExtractedToolCallInformation, - FunctionCall, ToolCall) + FunctionCall, ResponsesRequest, + ToolCall) +# yapf: enable from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import ( ToolParser, ToolParserManager) from vllm.logger import init_logger @@ -70,7 +74,7 @@ def __init__(self, tokenizer: AnyTokenizer): def extract_tool_calls( self, model_output: str, - request: ChatCompletionRequest, + request: Union[ChatCompletionRequest, ResponsesRequest], ) -> ExtractedToolCallInformation: # sanity check; avoid unnecessary processing @@ -374,4 +378,4 @@ def extract_tool_calls_streaming( except Exception: logger.exception("Error trying to handle streaming tool call.") - return None # do not stream a delta. skip this token ID. \ No newline at end of file + return None # do not stream a delta. skip this token ID. diff --git a/vllm/entrypoints/openai/tool_parsers/llama4_pythonic_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/llama4_pythonic_tool_parser.py index 6bf44a4345a..e0db7090081 100644 --- a/vllm/entrypoints/openai/tool_parsers/llama4_pythonic_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/llama4_pythonic_tool_parser.py @@ -9,11 +9,15 @@ from transformers import PreTrainedTokenizerBase import vllm.envs as envs +# yapf conflicts with isort for this block +# yapf: disable from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, DeltaFunctionCall, DeltaMessage, DeltaToolCall, ExtractedToolCallInformation, - FunctionCall, ToolCall) + FunctionCall, ResponsesRequest, + ToolCall) +# yapf: enable from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import ( ToolParser, ToolParserManager) from vllm.logger import init_logger @@ -55,8 +59,9 @@ def current_tool_index(self, value: int) -> None: self.current_tool_id = value def extract_tool_calls( - self, model_output: str, - request: ChatCompletionRequest) -> ExtractedToolCallInformation: + self, model_output: str, request: Union[ChatCompletionRequest, + ResponsesRequest] + ) -> ExtractedToolCallInformation: """ Extract the tool calls from a complete model response. """ diff --git a/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py index 5698bc70af2..115424c5b4f 100644 --- a/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py @@ -12,11 +12,15 @@ from transformers import PreTrainedTokenizerBase from vllm.entrypoints.chat_utils import random_tool_call_id +# yapf conflicts with isort for this block +# yapf: disable from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, DeltaFunctionCall, DeltaMessage, DeltaToolCall, ExtractedToolCallInformation, - FunctionCall, ToolCall) + FunctionCall, ResponsesRequest, + ToolCall) +# yapf: enable from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import ( ToolParser, ToolParserManager) from vllm.entrypoints.openai.tool_parsers.utils import (find_common_prefix, @@ -54,8 +58,9 @@ def __init__(self, tokenizer: PreTrainedTokenizerBase): self.tool_call_regex = re.compile(r"\[{.*?}\]", re.DOTALL) def extract_tool_calls( - self, model_output: str, - request: ChatCompletionRequest) -> ExtractedToolCallInformation: + self, model_output: str, request: Union[ChatCompletionRequest, + ResponsesRequest] + ) -> ExtractedToolCallInformation: """ Extract the tool calls from a complete model response. 
""" diff --git a/vllm/entrypoints/openai/tool_parsers/minimax_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/minimax_tool_parser.py index 6ba32e38fcd..0a5ca0f4a89 100644 --- a/vllm/entrypoints/openai/tool_parsers/minimax_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/minimax_tool_parser.py @@ -10,11 +10,15 @@ from partial_json_parser.core.options import Allow from vllm.entrypoints.chat_utils import random_tool_call_id +# yapf conflicts with isort for this block +# yapf: disable from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, DeltaFunctionCall, DeltaMessage, DeltaToolCall, ExtractedToolCallInformation, - FunctionCall, ToolCall) + FunctionCall, ResponsesRequest, + ToolCall) +# yapf: enable from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import ( ToolParser, ToolParserManager) from vllm.logger import init_logger @@ -83,7 +87,7 @@ def remove_tool_calls_from_think(match): def extract_tool_calls( self, model_output: str, - request: ChatCompletionRequest, + request: Union[ChatCompletionRequest, ResponsesRequest], ) -> ExtractedToolCallInformation: # Preprocess to remove tool calls from thinking tags diff --git a/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py index c0691f12290..706a352951d 100644 --- a/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py @@ -12,11 +12,15 @@ from partial_json_parser.core.options import Allow from pydantic import Field +# yapf conflicts with isort for this block +# yapf: disable from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, DeltaFunctionCall, DeltaMessage, DeltaToolCall, ExtractedToolCallInformation, - FunctionCall, ToolCall) + FunctionCall, ResponsesRequest, + ToolCall) +# yapf: enable from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import ( ToolParser, ToolParserManager) from vllm.entrypoints.openai.tool_parsers.utils import ( @@ -88,7 +92,10 @@ def __init__(self, tokenizer: AnyTokenizer): "the tokenizer!") def adjust_request( - self, request: ChatCompletionRequest) -> ChatCompletionRequest: + self, request: Union[ChatCompletionRequest, ResponsesRequest] + ) -> Union[ChatCompletionRequest, ResponsesRequest]: + if not isinstance(request, ChatCompletionRequest): + return request if not isinstance( self.model_tokenizer, MistralTokenizer ) and request.tools and request.tool_choice != 'none': @@ -101,9 +108,8 @@ def adjust_request( return request def extract_tool_calls( - self, - model_output: str, - request: ChatCompletionRequest, + self, model_output: str, request: Union[ChatCompletionRequest, + ResponsesRequest] ) -> ExtractedToolCallInformation: """ Extract the tool calls from a complete model response. 
Requires diff --git a/vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py index 5501028cf36..6159c89f43a 100644 --- a/vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py @@ -3,16 +3,20 @@ import json from collections.abc import Sequence -from typing import Any, Optional +from typing import Any, Optional, Union import regex as re from transformers import PreTrainedTokenizerBase from vllm.entrypoints.chat_utils import random_tool_call_id +# yapf conflicts with isort for this block +# yapf: disable from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, DeltaMessage, ExtractedToolCallInformation, - FunctionCall, ToolCall) + FunctionCall, ResponsesRequest, + ToolCall) +# yapf: enable from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import ( ToolParser, ToolParserManager) from vllm.logger import init_logger @@ -43,8 +47,9 @@ def __init__(self, tokenizer: PreTrainedTokenizerBase) -> None: self.bot_token: str = "functools" def extract_tool_calls( - self, model_output: str, - request: ChatCompletionRequest) -> ExtractedToolCallInformation: + self, model_output: str, request: Union[ChatCompletionRequest, + ResponsesRequest] + ) -> ExtractedToolCallInformation: """ Extract the tool calls from a complete model response. """ diff --git a/vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py index 73329cdf701..041bd6efcc9 100644 --- a/vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py @@ -10,11 +10,15 @@ from transformers import PreTrainedTokenizerBase import vllm.envs as envs +# yapf conflicts with isort for this block +# yapf: disable from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, DeltaFunctionCall, DeltaMessage, DeltaToolCall, ExtractedToolCallInformation, - FunctionCall, ToolCall) + FunctionCall, ResponsesRequest, + ToolCall) +# yapf: enable from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import ( ToolParser, ToolParserManager) from vllm.logger import init_logger @@ -58,8 +62,9 @@ def current_tool_index(self, value: int) -> None: self.current_tool_id = value def extract_tool_calls( - self, model_output: str, - request: ChatCompletionRequest) -> ExtractedToolCallInformation: + self, model_output: str, request: Union[ChatCompletionRequest, + ResponsesRequest] + ) -> ExtractedToolCallInformation: """ Extract the tool calls from a complete model response. 
""" diff --git a/vllm/entrypoints/openai/tool_parsers/xlam_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/xlam_tool_parser.py index 321718b1c95..a2b2d456ed8 100644 --- a/vllm/entrypoints/openai/tool_parsers/xlam_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/xlam_tool_parser.py @@ -8,11 +8,15 @@ import regex as re from vllm.entrypoints.chat_utils import random_tool_call_id +# yapf conflicts with isort for this block +# yapf: disable from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, DeltaFunctionCall, DeltaMessage, DeltaToolCall, ExtractedToolCallInformation, - FunctionCall, ToolCall) + FunctionCall, ResponsesRequest, + ToolCall) +# yapf: enable from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import ( ToolParser, ToolParserManager) from vllm.logger import init_logger @@ -115,8 +119,9 @@ def preprocess_model_output( return model_output, None def extract_tool_calls( - self, model_output: str, - request: ChatCompletionRequest) -> ExtractedToolCallInformation: + self, model_output: str, request: Union[ChatCompletionRequest, + ResponsesRequest] + ) -> ExtractedToolCallInformation: """ Extract tool calls from a complete model output. """ diff --git a/vllm/utils/__init__.py b/vllm/utils/__init__.py index c18f1d12ba9..d09edfb5efe 100644 --- a/vllm/utils/__init__.py +++ b/vllm/utils/__init__.py @@ -510,6 +510,11 @@ def random_uuid() -> str: return str(uuid.uuid4().hex) +def random_fc_uuid() -> str: + """Generates a random UUID for function call tool outputs.""" + return str(os.urandom(24).hex()) + + class AsyncMicrobatchTokenizer: """Asynchronous tokenizer with micro-batching.