
Commit 417ad89

[Inference Providers] Fix structured output schema in chat completion (#3082)
* fix structured output
* fix
* style
* run style again
* fix tests
* rename types
* review suggestions
* no need to mutate parameters
* docs
* better
1 parent 0f27bdf commit 417ad89

13 files changed: +264 −20 lines changed

docs/source/en/guides/inference.md

Lines changed: 104 additions & 0 deletions
@@ -308,6 +308,110 @@ You might wonder why using [`InferenceClient`] instead of OpenAI's client? There

</Tip>

## Function Calling

Function calling allows LLMs to interact with external tools, such as user-defined functions or APIs. This enables users to easily build applications tailored to specific use cases and real-world tasks. `InferenceClient` implements the same tool-calling interface as the OpenAI Chat Completions API. Here is a simple example of tool calling using [Nebius](https://nebius.com/) as the inference provider:

```python
from huggingface_hub import InferenceClient

tools = [
    {
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": "Get current temperature for a given location.",
            "parameters": {
                "type": "object",
                "properties": {
                    "location": {
                        "type": "string",
                        "description": "City and country e.g. Paris, France",
                    }
                },
                "required": ["location"],
            },
        },
    }
]

client = InferenceClient(provider="nebius")

response = client.chat.completions.create(
    model="Qwen/Qwen2.5-72B-Instruct",
    messages=[
        {
            "role": "user",
            "content": "What's the weather like the next 3 days in London, UK?",
        }
    ],
    tools=tools,
    tool_choice="auto",
)

print(response.choices[0].message.tool_calls[0].function.arguments)
```

<Tip>

Please refer to each provider's documentation to verify which of their models support function/tool calling.

</Tip>

## Structured Outputs & JSON Mode

`InferenceClient` supports JSON mode for syntactically valid JSON responses and Structured Outputs for schema-enforced responses. JSON mode provides machine-readable data without a strict structure, while Structured Outputs guarantee both valid JSON and adherence to a predefined schema for reliable downstream processing.

We follow the OpenAI API specs for both JSON mode and Structured Outputs. You can enable them via the `response_format` argument. Here is an example of Structured Outputs using [Cerebras](https://www.cerebras.ai/) as the inference provider:

```python
from huggingface_hub import InferenceClient

json_schema = {
    "name": "book",
    "schema": {
        "properties": {
            "name": {
                "title": "Name",
                "type": "string",
            },
            "authors": {
                "items": {"type": "string"},
                "title": "Authors",
                "type": "array",
            },
        },
        "required": ["name", "authors"],
        "title": "Book",
        "type": "object",
    },
    "strict": True,
}

client = InferenceClient(provider="cerebras")

completion = client.chat.completions.create(
    model="Qwen/Qwen3-32B",
    messages=[
        {"role": "system", "content": "Extract the books information."},
        {"role": "user", "content": "I recently read 'The Great Gatsby' by F. Scott Fitzgerald."},
    ],
    response_format={
        "type": "json_schema",
        "json_schema": json_schema,
    },
)

print(completion.choices[0].message)
```

<Tip>

Please refer to each provider's documentation to verify which of their models support Structured Outputs and JSON mode.

</Tip>
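JSON mode itself takes no schema. A minimal sketch of enabling it through the same `response_format` argument (assuming the Cerebras setup above and a model that supports JSON mode):

```python
from huggingface_hub import InferenceClient

client = InferenceClient(provider="cerebras")

completion = client.chat.completions.create(
    model="Qwen/Qwen3-32B",
    messages=[
        {"role": "system", "content": "Answer in JSON."},
        {"role": "user", "content": "List two books by F. Scott Fitzgerald."},
    ],
    # JSON mode: the output is guaranteed to be valid JSON, but its structure
    # is not enforced beyond what the prompt requests.
    response_format={"type": "json_object"},
)

print(completion.choices[0].message)
```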
## Async client

docs/source/en/package_reference/inference_types.md

Lines changed: 7 additions & 1 deletion
@@ -57,12 +57,18 @@ This part of the lib is still under development and will be improved in future r

 [[autodoc]] huggingface_hub.ChatCompletionInputFunctionName

-[[autodoc]] huggingface_hub.ChatCompletionInputGrammarType
+[[autodoc]] huggingface_hub.ChatCompletionInputJSONSchema

 [[autodoc]] huggingface_hub.ChatCompletionInputMessage

 [[autodoc]] huggingface_hub.ChatCompletionInputMessageChunk

+[[autodoc]] huggingface_hub.ChatCompletionInputResponseFormatJSONObject
+
+[[autodoc]] huggingface_hub.ChatCompletionInputResponseFormatJSONSchema
+
+[[autodoc]] huggingface_hub.ChatCompletionInputResponseFormatText
+
 [[autodoc]] huggingface_hub.ChatCompletionInputStreamOptions

 [[autodoc]] huggingface_hub.ChatCompletionInputTool

docs/source/ko/package_reference/inference_types.md

Lines changed: 7 additions & 1 deletion
@@ -56,12 +56,18 @@ rendered properly in your Markdown viewer.

 [[autodoc]] huggingface_hub.ChatCompletionInputFunctionName

-[[autodoc]] huggingface_hub.ChatCompletionInputGrammarType
+[[autodoc]] huggingface_hub.ChatCompletionInputJSONSchema

 [[autodoc]] huggingface_hub.ChatCompletionInputMessage

 [[autodoc]] huggingface_hub.ChatCompletionInputMessageChunk

+[[autodoc]] huggingface_hub.ChatCompletionInputResponseFormatJSONObject
+
+[[autodoc]] huggingface_hub.ChatCompletionInputResponseFormatJSONSchema
+
+[[autodoc]] huggingface_hub.ChatCompletionInputResponseFormatText
+
 [[autodoc]] huggingface_hub.ChatCompletionInputStreamOptions

 [[autodoc]] huggingface_hub.ChatCompletionInputTool

src/huggingface_hub/__init__.py

Lines changed: 12 additions & 3 deletions
@@ -301,10 +301,13 @@
     "ChatCompletionInputFunctionDefinition",
     "ChatCompletionInputFunctionName",
     "ChatCompletionInputGrammarType",
-    "ChatCompletionInputGrammarTypeType",
+    "ChatCompletionInputJSONSchema",
     "ChatCompletionInputMessage",
     "ChatCompletionInputMessageChunk",
     "ChatCompletionInputMessageChunkType",
+    "ChatCompletionInputResponseFormatJSONObject",
+    "ChatCompletionInputResponseFormatJSONSchema",
+    "ChatCompletionInputResponseFormatText",
     "ChatCompletionInputStreamOptions",
     "ChatCompletionInputTool",
     "ChatCompletionInputToolCall",
@@ -545,10 +548,13 @@
     "ChatCompletionInputFunctionDefinition",
     "ChatCompletionInputFunctionName",
     "ChatCompletionInputGrammarType",
-    "ChatCompletionInputGrammarTypeType",
+    "ChatCompletionInputJSONSchema",
     "ChatCompletionInputMessage",
     "ChatCompletionInputMessageChunk",
     "ChatCompletionInputMessageChunkType",
+    "ChatCompletionInputResponseFormatJSONObject",
+    "ChatCompletionInputResponseFormatJSONSchema",
+    "ChatCompletionInputResponseFormatText",
     "ChatCompletionInputStreamOptions",
     "ChatCompletionInputTool",
     "ChatCompletionInputToolCall",
@@ -1267,10 +1273,13 @@ def __dir__():
         ChatCompletionInputFunctionDefinition,  # noqa: F401
         ChatCompletionInputFunctionName,  # noqa: F401
         ChatCompletionInputGrammarType,  # noqa: F401
-        ChatCompletionInputGrammarTypeType,  # noqa: F401
+        ChatCompletionInputJSONSchema,  # noqa: F401
         ChatCompletionInputMessage,  # noqa: F401
         ChatCompletionInputMessageChunk,  # noqa: F401
         ChatCompletionInputMessageChunkType,  # noqa: F401
+        ChatCompletionInputResponseFormatJSONObject,  # noqa: F401
+        ChatCompletionInputResponseFormatJSONSchema,  # noqa: F401
+        ChatCompletionInputResponseFormatText,  # noqa: F401
         ChatCompletionInputStreamOptions,  # noqa: F401
         ChatCompletionInputTool,  # noqa: F401
         ChatCompletionInputToolCall,  # noqa: F401

src/huggingface_hub/inference/_generated/types/__init__.py

Lines changed: 4 additions & 1 deletion
@@ -24,10 +24,13 @@
     ChatCompletionInputFunctionDefinition,
     ChatCompletionInputFunctionName,
     ChatCompletionInputGrammarType,
-    ChatCompletionInputGrammarTypeType,
+    ChatCompletionInputJSONSchema,
     ChatCompletionInputMessage,
     ChatCompletionInputMessageChunk,
     ChatCompletionInputMessageChunkType,
+    ChatCompletionInputResponseFormatJSONObject,
+    ChatCompletionInputResponseFormatJSONSchema,
+    ChatCompletionInputResponseFormatText,
     ChatCompletionInputStreamOptions,
     ChatCompletionInputTool,
     ChatCompletionInputToolCall,

src/huggingface_hub/inference/_generated/types/chat_completion.py

Lines changed: 43 additions & 9 deletions
@@ -3,7 +3,7 @@
 # See:
 # - script: https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/scripts/inference-codegen.ts
 # - specs: https://github.com/huggingface/huggingface.js/tree/main/packages/tasks/src/tasks.
-from typing import Any, List, Literal, Optional, Union
+from typing import Any, Dict, List, Literal, Optional, Union

 from .base import BaseInferenceType, dataclass_with_extra

@@ -45,17 +45,51 @@ class ChatCompletionInputMessage(BaseInferenceType):
     tool_calls: Optional[List[ChatCompletionInputToolCall]] = None


-ChatCompletionInputGrammarTypeType = Literal["json", "regex", "json_schema"]
+@dataclass_with_extra
+class ChatCompletionInputJSONSchema(BaseInferenceType):
+    name: str
+    """
+    The name of the response format.
+    """
+    description: Optional[str] = None
+    """
+    A description of what the response format is for, used by the model to determine
+    how to respond in the format.
+    """
+    schema: Optional[Dict[str, object]] = None
+    """
+    The schema for the response format, described as a JSON Schema object. Learn how
+    to build JSON schemas [here](https://json-schema.org/).
+    """
+    strict: Optional[bool] = None
+    """
+    Whether to enable strict schema adherence when generating the output. If set to
+    true, the model will always follow the exact schema defined in the `schema`
+    field.
+    """


 @dataclass_with_extra
-class ChatCompletionInputGrammarType(BaseInferenceType):
-    type: "ChatCompletionInputGrammarTypeType"
-    value: Any
-    """A string that represents a [JSON Schema](https://json-schema.org/).
-    JSON Schema is a declarative language that allows to annotate JSON documents
-    with types and descriptions.
-    """
+class ChatCompletionInputResponseFormatText(BaseInferenceType):
+    type: Literal["text"]
+
+
+@dataclass_with_extra
+class ChatCompletionInputResponseFormatJSONSchema(BaseInferenceType):
+    type: Literal["json_schema"]
+    json_schema: ChatCompletionInputJSONSchema
+
+
+@dataclass_with_extra
+class ChatCompletionInputResponseFormatJSONObject(BaseInferenceType):
+    type: Literal["json_object"]
+
+
+ChatCompletionInputGrammarType = Union[
+    ChatCompletionInputResponseFormatText,
+    ChatCompletionInputResponseFormatJSONSchema,
+    ChatCompletionInputResponseFormatJSONObject,
+]


 @dataclass_with_extra
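For reference, a minimal sketch of constructing the new response-format types directly (hypothetical schema values; the classes are exported from `huggingface_hub` as shown in the `__init__.py` diff above):

```python
from huggingface_hub import (
    ChatCompletionInputJSONSchema,
    ChatCompletionInputResponseFormatJSONSchema,
)

# Schema-enforced response format mirroring the OpenAI `json_schema` shape.
# The schema contents here are hypothetical.
response_format = ChatCompletionInputResponseFormatJSONSchema(
    type="json_schema",
    json_schema=ChatCompletionInputJSONSchema(
        name="book",
        schema={
            "type": "object",
            "properties": {"name": {"type": "string"}},
            "required": ["name"],
        },
        strict=True,
    ),
)
```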

src/huggingface_hub/inference/_providers/cerebras.py

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
-from huggingface_hub.inference._providers._common import BaseConversationalTask
+from ._common import BaseConversationalTask


 class CerebrasConversationalTask(BaseConversationalTask):

src/huggingface_hub/inference/_providers/cohere.py

Lines changed: 20 additions & 3 deletions
@@ -1,6 +1,8 @@
-from huggingface_hub.inference._providers._common import (
-    BaseConversationalTask,
-)
+from typing import Any, Dict, Optional
+
+from huggingface_hub.hf_api import InferenceProviderMapping
+
+from ._common import BaseConversationalTask


 _PROVIDER = "cohere"
@@ -13,3 +15,18 @@ def __init__(self):

     def _prepare_route(self, mapped_model: str, api_key: str) -> str:
         return "/compatibility/v1/chat/completions"
+
+    def _prepare_payload_as_dict(
+        self, inputs: Any, parameters: Dict, provider_mapping_info: InferenceProviderMapping
+    ) -> Optional[Dict]:
+        payload = super()._prepare_payload_as_dict(inputs, parameters, provider_mapping_info)
+        response_format = parameters.get("response_format")
+        if isinstance(response_format, dict) and response_format.get("type") == "json_schema":
+            json_schema_details = response_format.get("json_schema")
+            if isinstance(json_schema_details, dict) and "schema" in json_schema_details:
+                payload["response_format"] = {  # type: ignore [index]
+                    "type": "json_object",
+                    "schema": json_schema_details["schema"],
+                }
+
+        return payload
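In effect, this override rewrites an OpenAI-style `json_schema` response format into the `json_object` + `schema` shape before the request is sent. A hypothetical before/after sketch:

```python
# Hypothetical illustration of the Cohere payload rewrite above.
# OpenAI-style response_format passed by the caller:
response_format_in = {
    "type": "json_schema",
    "json_schema": {"name": "book", "schema": {"type": "object"}, "strict": True},
}

# Shape forwarded to Cohere by _prepare_payload_as_dict:
response_format_out = {
    "type": "json_object",
    "schema": response_format_in["json_schema"]["schema"],
}
```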

src/huggingface_hub/inference/_providers/fireworks_ai.py

Lines changed: 18 additions & 0 deletions
@@ -1,3 +1,7 @@
+from typing import Any, Dict, Optional
+
+from huggingface_hub.hf_api import InferenceProviderMapping
+
 from ._common import BaseConversationalTask


@@ -7,3 +11,17 @@ def __init__(self):

     def _prepare_route(self, mapped_model: str, api_key: str) -> str:
         return "/inference/v1/chat/completions"
+
+    def _prepare_payload_as_dict(
+        self, inputs: Any, parameters: Dict, provider_mapping_info: InferenceProviderMapping
+    ) -> Optional[Dict]:
+        payload = super()._prepare_payload_as_dict(inputs, parameters, provider_mapping_info)
+        response_format = parameters.get("response_format")
+        if isinstance(response_format, dict) and response_format.get("type") == "json_schema":
+            json_schema_details = response_format.get("json_schema")
+            if isinstance(json_schema_details, dict) and "schema" in json_schema_details:
+                payload["response_format"] = {  # type: ignore [index]
+                    "type": "json_object",
+                    "schema": json_schema_details["schema"],
+                }
+        return payload
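Fireworks AI gets the same rewrite as Cohere above: an OpenAI-style `json_schema` response format is converted to the provider's `json_object` shape, with the raw schema passed under the `schema` key.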

src/huggingface_hub/inference/_providers/hf_inference.py

Lines changed: 8 additions & 1 deletion
@@ -96,13 +96,20 @@ def __init__(self):
     def _prepare_payload_as_dict(
         self, inputs: Any, parameters: Dict, provider_mapping_info: InferenceProviderMapping
     ) -> Optional[Dict]:
+        payload = filter_none(parameters)
         mapped_model = provider_mapping_info.provider_id
         payload_model = parameters.get("model") or mapped_model

         if payload_model is None or payload_model.startswith(("http://", "https://")):
             payload_model = "dummy"

-        return {**filter_none(parameters), "model": payload_model, "messages": inputs}
+        response_format = parameters.get("response_format")
+        if isinstance(response_format, dict) and response_format.get("type") == "json_schema":
+            payload["response_format"] = {
+                "type": "json_object",
+                "value": response_format["json_schema"]["schema"],
+            }
+        return {**payload, "model": payload_model, "messages": inputs}

     def _prepare_url(self, api_key: str, mapped_model: str) -> str:
         base_url = (
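Note the target shape differs here: the HF Inference backend expects the schema under a `value` key (`{"type": "json_object", "value": <schema>}`) rather than the `schema` key used for Cohere and Fireworks.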
