Skip to content

Commit 88ad258

Browse files
authored
Adding ModelResponse.usage (#1647)
1 parent 6e6fee9 commit 88ad258

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

41 files changed

+923
-118
lines changed

docs/agents.md

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -145,6 +145,13 @@ async def main():
145145
CallToolsNode(
146146
model_response=ModelResponse(
147147
parts=[TextPart(content='Paris', part_kind='text')],
148+
usage=Usage(
149+
requests=1,
150+
request_tokens=56,
151+
response_tokens=1,
152+
total_tokens=57,
153+
details=None,
154+
),
148155
model_name='gpt-4o',
149156
timestamp=datetime.datetime(...),
150157
kind='response',
@@ -209,6 +216,13 @@ async def main():
209216
CallToolsNode(
210217
model_response=ModelResponse(
211218
parts=[TextPart(content='Paris', part_kind='text')],
219+
usage=Usage(
220+
requests=1,
221+
request_tokens=56,
222+
response_tokens=1,
223+
total_tokens=57,
224+
details=None,
225+
),
212226
model_name='gpt-4o',
213227
timestamp=datetime.datetime(...),
214228
kind='response',
@@ -805,6 +819,13 @@ with capture_run_messages() as messages: # (2)!
805819
part_kind='tool-call',
806820
)
807821
],
822+
usage=Usage(
823+
requests=1,
824+
request_tokens=62,
825+
response_tokens=4,
826+
total_tokens=66,
827+
details=None,
828+
),
808829
model_name='gpt-4o',
809830
timestamp=datetime.datetime(...),
810831
kind='response',
@@ -831,6 +852,13 @@ with capture_run_messages() as messages: # (2)!
831852
part_kind='tool-call',
832853
)
833854
],
855+
usage=Usage(
856+
requests=1,
857+
request_tokens=72,
858+
response_tokens=8,
859+
total_tokens=80,
860+
details=None,
861+
),
834862
model_name='gpt-4o',
835863
timestamp=datetime.datetime(...),
836864
kind='response',

docs/message-history.md

Lines changed: 45 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ and [`StreamedRunResult`][pydantic_ai.result.StreamedRunResult] (returned by [`A
2727

2828
Example of accessing methods on a [`RunResult`][pydantic_ai.agent.AgentRunResult] :
2929

30-
```python {title="run_result_messages.py" hl_lines="10 28"}
30+
```python {title="run_result_messages.py" hl_lines="10"}
3131
from pydantic_ai import Agent
3232

3333
agent = Agent('openai:gpt-4o', system_prompt='Be a helpful assistant.')
@@ -64,6 +64,13 @@ print(result.all_messages())
6464
part_kind='text',
6565
)
6666
],
67+
usage=Usage(
68+
requests=1,
69+
request_tokens=60,
70+
response_tokens=12,
71+
total_tokens=72,
72+
details=None,
73+
),
6774
model_name='gpt-4o',
6875
timestamp=datetime.datetime(...),
6976
kind='response',
@@ -75,7 +82,7 @@ _(This example is complete, it can be run "as is")_
7582

7683
Example of accessing methods on a [`StreamedRunResult`][pydantic_ai.result.StreamedRunResult] :
7784

78-
```python {title="streamed_run_result_messages.py" hl_lines="9 31"}
85+
```python {title="streamed_run_result_messages.py" hl_lines="9 40"}
7986
from pydantic_ai import Agent
8087

8188
agent = Agent('openai:gpt-4o', system_prompt='Be a helpful assistant.')
@@ -142,6 +149,13 @@ async def main():
142149
part_kind='text',
143150
)
144151
],
152+
usage=Usage(
153+
requests=0,
154+
request_tokens=50,
155+
response_tokens=12,
156+
total_tokens=62,
157+
details=None,
158+
),
145159
model_name='gpt-4o',
146160
timestamp=datetime.datetime(...),
147161
kind='response',
@@ -201,6 +215,13 @@ print(result2.all_messages())
201215
part_kind='text',
202216
)
203217
],
218+
usage=Usage(
219+
requests=1,
220+
request_tokens=60,
221+
response_tokens=12,
222+
total_tokens=72,
223+
details=None,
224+
),
204225
model_name='gpt-4o',
205226
timestamp=datetime.datetime(...),
206227
kind='response',
@@ -223,6 +244,13 @@ print(result2.all_messages())
223244
part_kind='text',
224245
)
225246
],
247+
usage=Usage(
248+
requests=1,
249+
request_tokens=61,
250+
response_tokens=26,
251+
total_tokens=87,
252+
details=None,
253+
),
226254
model_name='gpt-4o',
227255
timestamp=datetime.datetime(...),
228256
kind='response',
@@ -285,7 +313,7 @@ The message format is independent of the model used, so you can use messages in
285313

286314
In the example below, we reuse the message from the first agent run, which uses the `openai:gpt-4o` model, in a second agent run using the `google-gla:gemini-1.5-pro` model.
287315

288-
```python {title="Reusing messages with a different model" hl_lines="11"}
316+
```python {title="Reusing messages with a different model" hl_lines="17"}
289317
from pydantic_ai import Agent
290318

291319
agent = Agent('openai:gpt-4o', system_prompt='Be a helpful assistant.')
@@ -329,6 +357,13 @@ print(result2.all_messages())
329357
part_kind='text',
330358
)
331359
],
360+
usage=Usage(
361+
requests=1,
362+
request_tokens=60,
363+
response_tokens=12,
364+
total_tokens=72,
365+
details=None,
366+
),
332367
model_name='gpt-4o',
333368
timestamp=datetime.datetime(...),
334369
kind='response',
@@ -351,6 +386,13 @@ print(result2.all_messages())
351386
part_kind='text',
352387
)
353388
],
389+
usage=Usage(
390+
requests=1,
391+
request_tokens=61,
392+
response_tokens=26,
393+
total_tokens=87,
394+
details=None,
395+
),
354396
model_name='gemini-1.5-pro',
355397
timestamp=datetime.datetime(...),
356398
kind='response',

docs/testing.md

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,7 @@ from pydantic_ai.messages import (
9797
UserPromptPart,
9898
ModelRequest,
9999
)
100+
from pydantic_ai.usage import Usage
100101

101102
from fake_database import DatabaseConn
102103
from weather_app import run_weather_forecast, weather_agent
@@ -140,6 +141,13 @@ async def test_forecast():
140141
tool_call_id=IsStr(),
141142
)
142143
],
144+
usage=Usage(
145+
requests=1,
146+
request_tokens=71,
147+
response_tokens=7,
148+
total_tokens=78,
149+
details=None,
150+
),
143151
model_name='test',
144152
timestamp=IsNow(tz=timezone.utc),
145153
),
@@ -159,6 +167,13 @@ async def test_forecast():
159167
content='{"weather_forecast":"Sunny with a chance of rain"}',
160168
)
161169
],
170+
usage=Usage(
171+
requests=1,
172+
request_tokens=77,
173+
response_tokens=16,
174+
total_tokens=93,
175+
details=None,
176+
),
162177
model_name='test',
163178
timestamp=IsNow(tz=timezone.utc),
164179
),

docs/tools.md

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,13 @@ print(dice_result.all_messages())
9696
part_kind='tool-call',
9797
)
9898
],
99+
usage=Usage(
100+
requests=1,
101+
request_tokens=90,
102+
response_tokens=2,
103+
total_tokens=92,
104+
details=None,
105+
),
99106
model_name='gemini-1.5-flash',
100107
timestamp=datetime.datetime(...),
101108
kind='response',
@@ -122,6 +129,13 @@ print(dice_result.all_messages())
122129
part_kind='tool-call',
123130
)
124131
],
132+
usage=Usage(
133+
requests=1,
134+
request_tokens=91,
135+
response_tokens=4,
136+
total_tokens=95,
137+
details=None,
138+
),
125139
model_name='gemini-1.5-flash',
126140
timestamp=datetime.datetime(...),
127141
kind='response',
@@ -146,6 +160,13 @@ print(dice_result.all_messages())
146160
part_kind='text',
147161
)
148162
],
163+
usage=Usage(
164+
requests=1,
165+
request_tokens=92,
166+
response_tokens=12,
167+
total_tokens=104,
168+
details=None,
169+
),
149170
model_name='gemini-1.5-flash',
150171
timestamp=datetime.datetime(...),
151172
kind='response',

pydantic_ai_slim/pydantic_ai/_agent_graph.py

Lines changed: 6 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -301,16 +301,15 @@ async def _stream(
301301
ctx.state.message_history, model_settings, model_request_parameters
302302
) as streamed_response:
303303
self._did_stream = True
304-
ctx.state.usage.incr(_usage.Usage(), requests=1)
304+
ctx.state.usage.requests += 1
305305
yield streamed_response
306306
# In case the user didn't manually consume the full stream, ensure it is fully consumed here,
307307
# otherwise usage won't be properly counted:
308308
async for _ in streamed_response:
309309
pass
310310
model_response = streamed_response.get()
311-
request_usage = streamed_response.usage()
312311

313-
self._finish_handling(ctx, model_response, request_usage)
312+
self._finish_handling(ctx, model_response)
314313
assert self._result is not None # this should be set by the previous line
315314

316315
async def _make_request(
@@ -321,12 +320,12 @@ async def _make_request(
321320

322321
model_settings, model_request_parameters = await self._prepare_request(ctx)
323322
model_request_parameters = ctx.deps.model.customize_request_parameters(model_request_parameters)
324-
model_response, request_usage = await ctx.deps.model.request(
323+
model_response = await ctx.deps.model.request(
325324
ctx.state.message_history, model_settings, model_request_parameters
326325
)
327-
ctx.state.usage.incr(_usage.Usage(), requests=1)
326+
ctx.state.usage.incr(_usage.Usage())
328327

329-
return self._finish_handling(ctx, model_response, request_usage)
328+
return self._finish_handling(ctx, model_response)
330329

331330
async def _prepare_request(
332331
self, ctx: GraphRunContext[GraphAgentState, GraphAgentDeps[DepsT, NodeRunEndT]]
@@ -348,10 +347,9 @@ def _finish_handling(
348347
self,
349348
ctx: GraphRunContext[GraphAgentState, GraphAgentDeps[DepsT, NodeRunEndT]],
350349
response: _messages.ModelResponse,
351-
usage: _usage.Usage,
352350
) -> CallToolsNode[DepsT, NodeRunEndT]:
353351
# Update usage
354-
ctx.state.usage.incr(usage, requests=0)
352+
ctx.state.usage.incr(response.usage)
355353
if ctx.deps.usage_limits:
356354
ctx.deps.usage_limits.check_tokens(ctx.state.usage)
357355

pydantic_ai_slim/pydantic_ai/agent.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -551,6 +551,13 @@ async def main():
551551
CallToolsNode(
552552
model_response=ModelResponse(
553553
parts=[TextPart(content='Paris', part_kind='text')],
554+
usage=Usage(
555+
requests=1,
556+
request_tokens=56,
557+
response_tokens=1,
558+
total_tokens=57,
559+
details=None,
560+
),
554561
model_name='gpt-4o',
555562
timestamp=datetime.datetime(...),
556563
kind='response',
@@ -1715,6 +1722,13 @@ async def main():
17151722
CallToolsNode(
17161723
model_response=ModelResponse(
17171724
parts=[TextPart(content='Paris', part_kind='text')],
1725+
usage=Usage(
1726+
requests=1,
1727+
request_tokens=56,
1728+
response_tokens=1,
1729+
total_tokens=57,
1730+
details=None,
1731+
),
17181732
model_name='gpt-4o',
17191733
timestamp=datetime.datetime(...),
17201734
kind='response',
@@ -1853,6 +1867,13 @@ async def main():
18531867
CallToolsNode(
18541868
model_response=ModelResponse(
18551869
parts=[TextPart(content='Paris', part_kind='text')],
1870+
usage=Usage(
1871+
requests=1,
1872+
request_tokens=56,
1873+
response_tokens=1,
1874+
total_tokens=57,
1875+
details=None,
1876+
),
18561877
model_name='gpt-4o',
18571878
timestamp=datetime.datetime(...),
18581879
kind='response',

pydantic_ai_slim/pydantic_ai/messages.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414

1515
from ._utils import generate_tool_call_id as _generate_tool_call_id, now_utc as _now_utc
1616
from .exceptions import UnexpectedModelBehavior
17+
from .usage import Usage
1718

1819
AudioMediaType: TypeAlias = Literal['audio/wav', 'audio/mpeg']
1920
ImageMediaType: TypeAlias = Literal['image/jpeg', 'image/png', 'image/gif', 'image/webp']
@@ -554,6 +555,12 @@ class ModelResponse:
554555
parts: list[ModelResponsePart]
555556
"""The parts of the model message."""
556557

558+
usage: Usage = field(default_factory=Usage)
559+
"""Usage information for the request.
560+
561+
This has a default to make tests easier, and to support loading old messages where usage will be missing.
562+
"""
563+
557564
model_name: str | None = None
558565
"""The name of the model that generated the response."""
559566

pydantic_ai_slim/pydantic_ai/models/__init__.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -278,7 +278,7 @@ async def request(
278278
messages: list[ModelMessage],
279279
model_settings: ModelSettings | None,
280280
model_request_parameters: ModelRequestParameters,
281-
) -> tuple[ModelResponse, Usage]:
281+
) -> ModelResponse:
282282
"""Make a request to the model."""
283283
raise NotImplementedError()
284284

@@ -365,7 +365,10 @@ async def _get_event_iterator(self) -> AsyncIterator[ModelResponseStreamEvent]:
365365
def get(self) -> ModelResponse:
366366
"""Build a [`ModelResponse`][pydantic_ai.messages.ModelResponse] from the data received from the stream so far."""
367367
return ModelResponse(
368-
parts=self._parts_manager.get_parts(), model_name=self.model_name, timestamp=self.timestamp
368+
parts=self._parts_manager.get_parts(),
369+
model_name=self.model_name,
370+
timestamp=self.timestamp,
371+
usage=self.usage(),
369372
)
370373

371374
def usage(self) -> Usage:

0 commit comments

Comments (0)