diff --git a/docs/source/en/guides/inference.md b/docs/source/en/guides/inference.md
index d59138ab0d..1817aa9460 100644
--- a/docs/source/en/guides/inference.md
+++ b/docs/source/en/guides/inference.md
@@ -308,6 +308,110 @@ You might wonder why using [`InferenceClient`] instead of OpenAI's client? There
+## Function Calling
+
+Function calling allows LLMs to interact with external tools, such as user-defined functions or APIs. It enables users to build applications tailored to specific use cases and real-world tasks.
+[`InferenceClient`] implements the same tool calling interface as the OpenAI Chat Completions API. Here is a simple example of tool calling using [Nebius](https://nebius.com/) as the inference provider:
+
+```python
+from huggingface_hub import InferenceClient
+
+tools = [
+ {
+ "type": "function",
+ "function": {
+ "name": "get_weather",
+ "description": "Get current temperature for a given location.",
+ "parameters": {
+ "type": "object",
+ "properties": {
+ "location": {
+ "type": "string",
+ "description": "City and country e.g. Paris, France"
+ }
+ },
+ "required": ["location"],
+ },
+ }
+ }
+]
+
+client = InferenceClient(provider="nebius")
+
+response = client.chat.completions.create(
+ model="Qwen/Qwen2.5-72B-Instruct",
+ messages=[
+        {
+            "role": "user",
+            "content": "What's the current weather in London, UK?"
+        }
+ ],
+ tools=tools,
+ tool_choice="auto",
+)
+
+print(response.choices[0].message.tool_calls[0].function.arguments)
+```
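+
+The model does not run the tool itself: it returns the tool name and JSON-encoded arguments, and your code is responsible for executing the function (and, if needed, sending the result back in a follow-up message). Below is a minimal dispatch sketch building on the response above; the local `get_weather` implementation is a hypothetical stand-in:
+
+```python
+import json
+
+# Hypothetical local implementation of the tool declared above.
+def get_weather(location: str) -> str:
+    return f"Sunny and 22°C in {location}"
+
+tool_call = response.choices[0].message.tool_calls[0]
+
+# In the OpenAI-compatible interface, arguments arrive as a JSON-encoded string;
+# some providers may return a dict instead, so parse defensively.
+args = tool_call.function.arguments
+if isinstance(args, str):
+    args = json.loads(args)
+
+if tool_call.function.name == "get_weather":
+    print(get_weather(**args))
+```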
+
+Please refer to each provider's documentation to check which models support function/tool calling.
+
+## Structured Outputs & JSON Mode
+
+[`InferenceClient`] supports JSON mode for syntactically valid JSON responses and Structured Outputs for schema-enforced responses. JSON mode produces machine-readable output without enforcing a strict structure, while Structured Outputs guarantee both valid JSON and adherence to a predefined schema, making them more reliable for downstream processing.
+
+We follow the OpenAI API specs for both JSON mode and Structured Outputs. You can enable them via the `response_format` argument. Here is an example of Structured Outputs using [Cerebras](https://www.cerebras.ai/) as the inference provider:
+
+```python
+from huggingface_hub import InferenceClient
+
+json_schema = {
+ "name": "book",
+ "schema": {
+ "properties": {
+ "name": {
+ "title": "Name",
+ "type": "string",
+ },
+ "authors": {
+ "items": {"type": "string"},
+ "title": "Authors",
+ "type": "array",
+ },
+ },
+ "required": ["name", "authors"],
+ "title": "Book",
+ "type": "object",
+ },
+ "strict": True,
+}
+
+client = InferenceClient(provider="cerebras")
+
+completion = client.chat.completions.create(
+ model="Qwen/Qwen3-32B",
+ messages=[
+ {"role": "system", "content": "Extract the books information."},
+ {"role": "user", "content": "I recently read 'The Great Gatsby' by F. Scott Fitzgerald."},
+ ],
+ response_format={
+ "type": "json_schema",
+ "json_schema": json_schema,
+ },
+)
+
+print(completion.choices[0].message)
+```
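+
+If you only need syntactically valid JSON without enforcing a particular schema, enable JSON mode instead. Below is a minimal sketch reusing the client above; support varies by provider, and prompting the model to answer in JSON is recommended:
+
+```python
+completion = client.chat.completions.create(
+    model="Qwen/Qwen3-32B",
+    messages=[
+        {"role": "system", "content": "Answer in JSON."},
+        {"role": "user", "content": "List two novels by F. Scott Fitzgerald and their publication years."},
+    ],
+    # JSON mode guarantees well-formed JSON, but its structure is up to the model.
+    response_format={"type": "json_object"},
+)
+
+print(completion.choices[0].message.content)
+```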
+
+Please refer to each provider's documentation to check which models support Structured Outputs and JSON mode.
+
## Async client
diff --git a/docs/source/en/package_reference/inference_types.md b/docs/source/en/package_reference/inference_types.md
index 9bd2528fe5..1c90e9facb 100644
--- a/docs/source/en/package_reference/inference_types.md
+++ b/docs/source/en/package_reference/inference_types.md
@@ -57,12 +57,18 @@ This part of the lib is still under development and will be improved in future r
[[autodoc]] huggingface_hub.ChatCompletionInputFunctionName
-[[autodoc]] huggingface_hub.ChatCompletionInputGrammarType
+[[autodoc]] huggingface_hub.ChatCompletionInputJSONSchema
[[autodoc]] huggingface_hub.ChatCompletionInputMessage
[[autodoc]] huggingface_hub.ChatCompletionInputMessageChunk
+[[autodoc]] huggingface_hub.ChatCompletionInputResponseFormatJSONObject
+
+[[autodoc]] huggingface_hub.ChatCompletionInputResponseFormatJSONSchema
+
+[[autodoc]] huggingface_hub.ChatCompletionInputResponseFormatText
+
[[autodoc]] huggingface_hub.ChatCompletionInputStreamOptions
[[autodoc]] huggingface_hub.ChatCompletionInputTool
diff --git a/docs/source/ko/package_reference/inference_types.md b/docs/source/ko/package_reference/inference_types.md
index bf9de4b8e7..3746086ed2 100644
--- a/docs/source/ko/package_reference/inference_types.md
+++ b/docs/source/ko/package_reference/inference_types.md
@@ -56,12 +56,18 @@ rendered properly in your Markdown viewer.
[[autodoc]] huggingface_hub.ChatCompletionInputFunctionName
-[[autodoc]] huggingface_hub.ChatCompletionInputGrammarType
+[[autodoc]] huggingface_hub.ChatCompletionInputJSONSchema
[[autodoc]] huggingface_hub.ChatCompletionInputMessage
[[autodoc]] huggingface_hub.ChatCompletionInputMessageChunk
+[[autodoc]] huggingface_hub.ChatCompletionInputResponseFormatJSONObject
+
+[[autodoc]] huggingface_hub.ChatCompletionInputResponseFormatJSONSchema
+
+[[autodoc]] huggingface_hub.ChatCompletionInputResponseFormatText
+
[[autodoc]] huggingface_hub.ChatCompletionInputStreamOptions
[[autodoc]] huggingface_hub.ChatCompletionInputTool
diff --git a/src/huggingface_hub/__init__.py b/src/huggingface_hub/__init__.py
index 8e1a99787f..fe29e43e19 100644
--- a/src/huggingface_hub/__init__.py
+++ b/src/huggingface_hub/__init__.py
@@ -301,10 +301,13 @@
"ChatCompletionInputFunctionDefinition",
"ChatCompletionInputFunctionName",
"ChatCompletionInputGrammarType",
- "ChatCompletionInputGrammarTypeType",
+ "ChatCompletionInputJSONSchema",
"ChatCompletionInputMessage",
"ChatCompletionInputMessageChunk",
"ChatCompletionInputMessageChunkType",
+ "ChatCompletionInputResponseFormatJSONObject",
+ "ChatCompletionInputResponseFormatJSONSchema",
+ "ChatCompletionInputResponseFormatText",
"ChatCompletionInputStreamOptions",
"ChatCompletionInputTool",
"ChatCompletionInputToolCall",
@@ -545,10 +548,13 @@
"ChatCompletionInputFunctionDefinition",
"ChatCompletionInputFunctionName",
"ChatCompletionInputGrammarType",
- "ChatCompletionInputGrammarTypeType",
+ "ChatCompletionInputJSONSchema",
"ChatCompletionInputMessage",
"ChatCompletionInputMessageChunk",
"ChatCompletionInputMessageChunkType",
+ "ChatCompletionInputResponseFormatJSONObject",
+ "ChatCompletionInputResponseFormatJSONSchema",
+ "ChatCompletionInputResponseFormatText",
"ChatCompletionInputStreamOptions",
"ChatCompletionInputTool",
"ChatCompletionInputToolCall",
@@ -1267,10 +1273,13 @@ def __dir__():
ChatCompletionInputFunctionDefinition, # noqa: F401
ChatCompletionInputFunctionName, # noqa: F401
ChatCompletionInputGrammarType, # noqa: F401
- ChatCompletionInputGrammarTypeType, # noqa: F401
+ ChatCompletionInputJSONSchema, # noqa: F401
ChatCompletionInputMessage, # noqa: F401
ChatCompletionInputMessageChunk, # noqa: F401
ChatCompletionInputMessageChunkType, # noqa: F401
+ ChatCompletionInputResponseFormatJSONObject, # noqa: F401
+ ChatCompletionInputResponseFormatJSONSchema, # noqa: F401
+ ChatCompletionInputResponseFormatText, # noqa: F401
ChatCompletionInputStreamOptions, # noqa: F401
ChatCompletionInputTool, # noqa: F401
ChatCompletionInputToolCall, # noqa: F401
diff --git a/src/huggingface_hub/inference/_generated/types/__init__.py b/src/huggingface_hub/inference/_generated/types/__init__.py
index 92f286792b..63f6a653d6 100644
--- a/src/huggingface_hub/inference/_generated/types/__init__.py
+++ b/src/huggingface_hub/inference/_generated/types/__init__.py
@@ -24,10 +24,13 @@
ChatCompletionInputFunctionDefinition,
ChatCompletionInputFunctionName,
ChatCompletionInputGrammarType,
- ChatCompletionInputGrammarTypeType,
+ ChatCompletionInputJSONSchema,
ChatCompletionInputMessage,
ChatCompletionInputMessageChunk,
ChatCompletionInputMessageChunkType,
+ ChatCompletionInputResponseFormatJSONObject,
+ ChatCompletionInputResponseFormatJSONSchema,
+ ChatCompletionInputResponseFormatText,
ChatCompletionInputStreamOptions,
ChatCompletionInputTool,
ChatCompletionInputToolCall,
diff --git a/src/huggingface_hub/inference/_generated/types/chat_completion.py b/src/huggingface_hub/inference/_generated/types/chat_completion.py
index 9978c0a5a9..fe455ee710 100644
--- a/src/huggingface_hub/inference/_generated/types/chat_completion.py
+++ b/src/huggingface_hub/inference/_generated/types/chat_completion.py
@@ -3,7 +3,7 @@
# See:
# - script: https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/scripts/inference-codegen.ts
# - specs: https://github.com/huggingface/huggingface.js/tree/main/packages/tasks/src/tasks.
-from typing import Any, List, Literal, Optional, Union
+from typing import Any, Dict, List, Literal, Optional, Union
from .base import BaseInferenceType, dataclass_with_extra
@@ -45,17 +45,51 @@ class ChatCompletionInputMessage(BaseInferenceType):
tool_calls: Optional[List[ChatCompletionInputToolCall]] = None
-ChatCompletionInputGrammarTypeType = Literal["json", "regex", "json_schema"]
+@dataclass_with_extra
+class ChatCompletionInputJSONSchema(BaseInferenceType):
+ name: str
+ """
+ The name of the response format.
+ """
+ description: Optional[str] = None
+ """
+ A description of what the response format is for, used by the model to determine
+ how to respond in the format.
+ """
+ schema: Optional[Dict[str, object]] = None
+ """
+ The schema for the response format, described as a JSON Schema object. Learn how
+ to build JSON schemas [here](https://json-schema.org/).
+ """
+ strict: Optional[bool] = None
+ """
+ Whether to enable strict schema adherence when generating the output. If set to
+ true, the model will always follow the exact schema defined in the `schema`
+ field.
+ """
@dataclass_with_extra
-class ChatCompletionInputGrammarType(BaseInferenceType):
- type: "ChatCompletionInputGrammarTypeType"
- value: Any
- """A string that represents a [JSON Schema](https://json-schema.org/).
- JSON Schema is a declarative language that allows to annotate JSON documents
- with types and descriptions.
- """
+class ChatCompletionInputResponseFormatText(BaseInferenceType):
+ type: Literal["text"]
+
+
+@dataclass_with_extra
+class ChatCompletionInputResponseFormatJSONSchema(BaseInferenceType):
+ type: Literal["json_schema"]
+ json_schema: ChatCompletionInputJSONSchema
+
+
+@dataclass_with_extra
+class ChatCompletionInputResponseFormatJSONObject(BaseInferenceType):
+ type: Literal["json_object"]
+
+
+ChatCompletionInputGrammarType = Union[
+ ChatCompletionInputResponseFormatText,
+ ChatCompletionInputResponseFormatJSONSchema,
+ ChatCompletionInputResponseFormatJSONObject,
+]
@dataclass_with_extra
diff --git a/src/huggingface_hub/inference/_providers/cerebras.py b/src/huggingface_hub/inference/_providers/cerebras.py
index 12b1815832..a9b9c3aacb 100644
--- a/src/huggingface_hub/inference/_providers/cerebras.py
+++ b/src/huggingface_hub/inference/_providers/cerebras.py
@@ -1,4 +1,4 @@
-from huggingface_hub.inference._providers._common import BaseConversationalTask
+from ._common import BaseConversationalTask
class CerebrasConversationalTask(BaseConversationalTask):
diff --git a/src/huggingface_hub/inference/_providers/cohere.py b/src/huggingface_hub/inference/_providers/cohere.py
index 0dc35c7e6c..a5e9191cae 100644
--- a/src/huggingface_hub/inference/_providers/cohere.py
+++ b/src/huggingface_hub/inference/_providers/cohere.py
@@ -1,6 +1,8 @@
-from huggingface_hub.inference._providers._common import (
- BaseConversationalTask,
-)
+from typing import Any, Dict, Optional
+
+from huggingface_hub.hf_api import InferenceProviderMapping
+
+from ._common import BaseConversationalTask
_PROVIDER = "cohere"
@@ -13,3 +15,18 @@ def __init__(self):
def _prepare_route(self, mapped_model: str, api_key: str) -> str:
return "/compatibility/v1/chat/completions"
+
+ def _prepare_payload_as_dict(
+ self, inputs: Any, parameters: Dict, provider_mapping_info: InferenceProviderMapping
+ ) -> Optional[Dict]:
+ payload = super()._prepare_payload_as_dict(inputs, parameters, provider_mapping_info)
+ response_format = parameters.get("response_format")
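+        # Translate the OpenAI-style `json_schema` response format into the
+        # {"type": "json_object", "schema": ...} shape used by this provider.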
+ if isinstance(response_format, dict) and response_format.get("type") == "json_schema":
+ json_schema_details = response_format.get("json_schema")
+ if isinstance(json_schema_details, dict) and "schema" in json_schema_details:
+ payload["response_format"] = { # type: ignore [index]
+ "type": "json_object",
+ "schema": json_schema_details["schema"],
+ }
+
+ return payload
diff --git a/src/huggingface_hub/inference/_providers/fireworks_ai.py b/src/huggingface_hub/inference/_providers/fireworks_ai.py
index 9fc9aba806..b4cc19a570 100644
--- a/src/huggingface_hub/inference/_providers/fireworks_ai.py
+++ b/src/huggingface_hub/inference/_providers/fireworks_ai.py
@@ -1,3 +1,7 @@
+from typing import Any, Dict, Optional
+
+from huggingface_hub.hf_api import InferenceProviderMapping
+
from ._common import BaseConversationalTask
@@ -7,3 +11,17 @@ def __init__(self):
def _prepare_route(self, mapped_model: str, api_key: str) -> str:
return "/inference/v1/chat/completions"
+
+ def _prepare_payload_as_dict(
+ self, inputs: Any, parameters: Dict, provider_mapping_info: InferenceProviderMapping
+ ) -> Optional[Dict]:
+ payload = super()._prepare_payload_as_dict(inputs, parameters, provider_mapping_info)
+ response_format = parameters.get("response_format")
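+        # Translate the OpenAI-style `json_schema` response format into the
+        # {"type": "json_object", "schema": ...} shape used by this provider.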
+ if isinstance(response_format, dict) and response_format.get("type") == "json_schema":
+ json_schema_details = response_format.get("json_schema")
+ if isinstance(json_schema_details, dict) and "schema" in json_schema_details:
+ payload["response_format"] = { # type: ignore [index]
+ "type": "json_object",
+ "schema": json_schema_details["schema"],
+ }
+ return payload
diff --git a/src/huggingface_hub/inference/_providers/hf_inference.py b/src/huggingface_hub/inference/_providers/hf_inference.py
index 7923567be3..0b2cf1e7a3 100644
--- a/src/huggingface_hub/inference/_providers/hf_inference.py
+++ b/src/huggingface_hub/inference/_providers/hf_inference.py
@@ -96,13 +96,20 @@ def __init__(self):
def _prepare_payload_as_dict(
self, inputs: Any, parameters: Dict, provider_mapping_info: InferenceProviderMapping
) -> Optional[Dict]:
+ payload = filter_none(parameters)
mapped_model = provider_mapping_info.provider_id
payload_model = parameters.get("model") or mapped_model
if payload_model is None or payload_model.startswith(("http://", "https://")):
payload_model = "dummy"
- return {**filter_none(parameters), "model": payload_model, "messages": inputs}
+ response_format = parameters.get("response_format")
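+        # Map the OpenAI-style `json_schema` response format onto the
+        # {"type": "json_object", "value": ...} shape expected by the backend.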
+ if isinstance(response_format, dict) and response_format.get("type") == "json_schema":
+ payload["response_format"] = {
+ "type": "json_object",
+ "value": response_format["json_schema"]["schema"],
+ }
+ return {**payload, "model": payload_model, "messages": inputs}
def _prepare_url(self, api_key: str, mapped_model: str) -> str:
base_url = (
diff --git a/src/huggingface_hub/inference/_providers/nebius.py b/src/huggingface_hub/inference/_providers/nebius.py
index eccc9ee83b..85ad67c4c8 100644
--- a/src/huggingface_hub/inference/_providers/nebius.py
+++ b/src/huggingface_hub/inference/_providers/nebius.py
@@ -30,6 +30,17 @@ class NebiusConversationalTask(BaseConversationalTask):
def __init__(self):
super().__init__(provider="nebius", base_url="https://api.studio.nebius.ai")
+ def _prepare_payload_as_dict(
+ self, inputs: Any, parameters: Dict, provider_mapping_info: InferenceProviderMapping
+ ) -> Optional[Dict]:
+ payload = super()._prepare_payload_as_dict(inputs, parameters, provider_mapping_info)
+ response_format = parameters.get("response_format")
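+        # This provider consumes the raw JSON schema through a `guided_json` field.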
+ if isinstance(response_format, dict) and response_format.get("type") == "json_schema":
+ json_schema_details = response_format.get("json_schema")
+ if isinstance(json_schema_details, dict) and "schema" in json_schema_details:
+ payload["guided_json"] = json_schema_details["schema"] # type: ignore [index]
+ return payload
+
class NebiusTextToImageTask(TaskProviderHelper):
def __init__(self):
diff --git a/src/huggingface_hub/inference/_providers/sambanova.py b/src/huggingface_hub/inference/_providers/sambanova.py
index 92bc95daa4..ed96fb766c 100644
--- a/src/huggingface_hub/inference/_providers/sambanova.py
+++ b/src/huggingface_hub/inference/_providers/sambanova.py
@@ -9,6 +9,20 @@ class SambanovaConversationalTask(BaseConversationalTask):
def __init__(self):
super().__init__(provider="sambanova", base_url="https://api.sambanova.ai")
+ def _prepare_payload_as_dict(
+ self, inputs: Any, parameters: Dict, provider_mapping_info: InferenceProviderMapping
+ ) -> Optional[Dict]:
+        # Force `strict=False` in the `json_schema` response format before building
+        # the payload, as this provider does not support strict schema adherence.
+        response_format_config = parameters.get("response_format")
+        if isinstance(response_format_config, dict) and response_format_config.get("type") == "json_schema":
+            json_schema_config = response_format_config.get("json_schema")
+            if isinstance(json_schema_config, dict) and json_schema_config.get("strict") in (True, None):
+                json_schema_config["strict"] = False
+
+        payload = super()._prepare_payload_as_dict(inputs, parameters, provider_mapping_info)
+        return payload
+
class SambanovaFeatureExtractionTask(TaskProviderHelper):
def __init__(self):
diff --git a/src/huggingface_hub/inference/_providers/together.py b/src/huggingface_hub/inference/_providers/together.py
index b27e332938..de166b7baf 100644
--- a/src/huggingface_hub/inference/_providers/together.py
+++ b/src/huggingface_hub/inference/_providers/together.py
@@ -51,6 +51,21 @@ class TogetherConversationalTask(BaseConversationalTask):
def __init__(self):
super().__init__(provider=_PROVIDER, base_url=_BASE_URL)
+ def _prepare_payload_as_dict(
+ self, inputs: Any, parameters: Dict, provider_mapping_info: InferenceProviderMapping
+ ) -> Optional[Dict]:
+ payload = super()._prepare_payload_as_dict(inputs, parameters, provider_mapping_info)
+ response_format = parameters.get("response_format")
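+        # Translate the OpenAI-style `json_schema` response format into the
+        # {"type": "json_object", "schema": ...} shape used by this provider.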
+ if isinstance(response_format, dict) and response_format.get("type") == "json_schema":
+ json_schema_details = response_format.get("json_schema")
+ if isinstance(json_schema_details, dict) and "schema" in json_schema_details:
+ payload["response_format"] = { # type: ignore [index]
+ "type": "json_object",
+ "schema": json_schema_details["schema"],
+ }
+
+ return payload
+
class TogetherTextToImageTask(TogetherTask):
def __init__(self):