Stop sharing tool retry count across all runs of the same agent (#1918)

dmontagu · web-flow · commit 41130b5dc215 · 2025-06-06T14:46:56.000Z
diff --git a/pydantic_ai_slim/pydantic_ai/_agent_graph.py b/pydantic_ai_slim/pydantic_ai/_agent_graph.py
@@ -151,10 +151,6 @@ async def _get_first_message(
         ctx.state.message_history = history
         run_context.messages = history
 
-        # TODO: We need to make it so that function_tools are not shared between runs
-        #   See comment on the current_retry field of `Tool` for more details.
-        for tool in ctx.deps.function_tools.values():
-            tool.current_retry = 0
         return next_message
 
     async def _prepare_messages(
diff --git a/pydantic_ai_slim/pydantic_ai/agent.py b/pydantic_ai_slim/pydantic_ai/agent.py
@@ -646,11 +646,6 @@ async def main():
         # typecast reasonable, even though it is possible to violate it with otherwise-type-checked code.
         output_validators = cast(list[_output.OutputValidator[AgentDepsT, RunOutputDataT]], self._output_validators)
 
-        # TODO: Instead of this, copy the function tools to ensure they don't share current_retry state between agent
-        #  runs. Requires some changes to `Tool` to make them copyable though.
-        for v in self._function_tools.values():
-            v.current_retry = 0
-
         model_settings = merge_model_settings(self.model_settings, model_settings)
         usage_limits = usage_limits or _usage.UsageLimits()
 
@@ -679,6 +674,10 @@ async def get_instructions(run_context: RunContext[AgentDepsT]) -> str | None:
                 instructions += '\n' + await instructions_runner.run(run_context)
             return instructions.strip()
 
+        # Copy the function tools so that retry state is agent-run-specific
+        # Note that each copy's retry count starts at 0 because `current_retry` has `default=0` and `init=False`.
+        run_function_tools = {k: dataclasses.replace(v) for k, v in self._function_tools.items()}
+
         graph_deps = _agent_graph.GraphAgentDeps[AgentDepsT, RunOutputDataT](
             user_deps=deps,
             prompt=user_prompt,
@@ -690,7 +689,7 @@ async def get_instructions(run_context: RunContext[AgentDepsT]) -> str | None:
             end_strategy=self.end_strategy,
             output_schema=output_schema,
             output_validators=output_validators,
-            function_tools=self._function_tools,
+            function_tools=run_function_tools,
             mcp_servers=self._mcp_servers,
             default_retries=self._default_retries,
             tracer=tracer,
diff --git a/pydantic_ai_slim/pydantic_ai/tools.py b/pydantic_ai_slim/pydantic_ai/tools.py
@@ -215,8 +215,10 @@ class Tool(Generic[AgentDepsT]):
     This schema may be modified by the `prepare` function or by the Model class prior to including it in an API request.
     """
 
-    # TODO: Move this state off the Tool class, which is otherwise stateless.
-    #   This should be tracked inside a specific agent run, not the tool.
+    # TODO: Consider moving this current_retry state to live on something other than the tool.
+    #   We've worked around this for now by copying instances of the tool when creating new runs,
+    #   but this is a bit fragile. Moving the tool retry counts to live on the agent run state would likely clean things
+    #   up, though it is also likely to be a larger refactoring effort.
     current_retry: int = field(default=0, init=False)
 
     def __init__(
diff --git a/tests/models/test_model_test.py b/tests/models/test_model_test.py
@@ -2,11 +2,14 @@
 
 from __future__ import annotations as _annotations
 
+import asyncio
+import dataclasses
 from datetime import timezone
 from typing import Annotated, Any, Literal
 
 import pytest
 from annotated_types import Ge, Gt, Le, Lt, MaxLen, MinLen
+from anyio import Event
 from inline_snapshot import snapshot
 from pydantic import BaseModel, Field
 
@@ -160,6 +163,36 @@ def validate_output(ctx: RunContext[None], output: OutputModel) -> OutputModel:
     assert call_count == 3
 
 
+@dataclasses.dataclass
+class AgentRunDeps:
+    run_id: int
+
+
+@pytest.mark.anyio
+async def test_multiple_concurrent_tool_retries():
+    class OutputModel(BaseModel):
+        x: int
+        y: str
+
+    agent = Agent('test', deps_type=AgentRunDeps, output_type=OutputModel, retries=2)
+    retried_run_ids = set[int]()
+    event = Event()
+
+    run_ids = list(range(5))  # fire off 5 run ids that will all retry the tool before they finish
+
+    @agent.tool
+    async def tool_that_must_be_retried(ctx: RunContext[AgentRunDeps]) -> None:
+        if ctx.deps.run_id not in retried_run_ids:
+            retried_run_ids.add(ctx.deps.run_id)
+            raise ModelRetry('Fail')
+        if len(retried_run_ids) == len(run_ids):  # pragma: no branch  # won't branch if all runs happen very quickly
+            event.set()
+        await event.wait()  # ensure a retry is done by all runs before any of them finish their flow
+        return None
+
+    await asyncio.gather(*[agent.run('Hello', model=TestModel(), deps=AgentRunDeps(run_id)) for run_id in run_ids])
+
+
 def test_output_tool_retry_error_handled_with_custom_args(set_event_loop: None):
     class ResultModel(BaseModel):
         x: int
diff --git a/tests/test_live.py b/tests/test_live.py
@@ -93,7 +93,7 @@ def cohere(http_client: httpx.AsyncClient, _tmp_path: Path) -> Model:
     pytest.param(anthropic, id='anthropic'),
     pytest.param(ollama, id='ollama'),
     pytest.param(mistral, id='mistral'),
-    pytest.param(cohere, id='cohere'),
+    pytest.param(cohere, id='cohere', marks=pytest.mark.skip(reason='Might be causing hangs in CI')),
 ]
 GetModel = Callable[[httpx.AsyncClient, Path], Model]
 

Original file line number	Diff line number	Diff line change
`@@ -93,7 +93,7 @@ def cohere(http_client: httpx.AsyncClient, _tmp_path: Path) -> Model:`
`93`	`93`	`pytest.param(anthropic, id='anthropic'),`
`94`	`94`	`pytest.param(ollama, id='ollama'),`
`95`	`95`	`pytest.param(mistral, id='mistral'),`
`96`		`- pytest.param(cohere, id='cohere'),`
	`96`	`+ pytest.param(cohere, id='cohere', marks=pytest.mark.skip(reason='Might be causing hangs in CI')),`
`97`	`97`	`]`
`98`	`98`	`GetModel = Callable[[httpx.AsyncClient, Path], Model]`
`99`	`99`