Rename the Judge `toolsets` parameter to `tools`

PabloCabaleiro · PabloCabaleiro · commit 9e8c63690934 · 2025-10-15T15:30:02.000+02:00
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -7,6 +7,14 @@ The format follows the principles of [Keep a Changelog](https://keepachangelog.c
 ## [Unreleased](https://github.com/PabloCabaleiro/pondera/tree/main)
 
 
+## [v0.6.1](https://github.com/PabloCabaleiro/pondera/releases/tag/v0.6.1) - 2025-10-15
+
+<!-- markdownlint-disable-next-line MD024 -->
+### Fixed
+
+- Renamed the `toolsets` parameter to `tools` in the `Judge` constructor and its internal implementation for consistency with the PydanticAI API, and removed the `toolsets` parameter from `get_agent`
+
+
 ## [v0.6.0](https://github.com/PabloCabaleiro/pondera/releases/tag/v0.6.0) - 2025-10-13
 
 <!-- markdownlint-disable-next-line MD024 -->
diff --git a/README.md b/README.md
@@ -101,9 +101,9 @@ class ConstantJudge(JudgeProtocol):
 
 ```bash
 # Using uv (recommended)
-uv add 'git+ssh://git@github.com/PabloCabaleiro/pondera.git@v0.6.0'
+uv add 'git+ssh://git@github.com/PabloCabaleiro/pondera.git@v0.6.1'
 # or from source in editable mode
-uv pip install 'git+ssh://git@github.com/PabloCabaleiro/pondera.git@v0.6.0'
+uv pip install 'git+ssh://git@github.com/PabloCabaleiro/pondera.git@v0.6.1'
 ```
 
 The judge uses the pydantic-ai ecosystem. Configure provider credentials via env vars (`OPENAI_API_KEY`, `ANTHROPIC_API_KEY`, `AZURE_OPENAI_API_KEY`, etc.) plus optional `PONDERA_` settings.
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "pondera"
-version = "0.6.0"
+version = "0.6.1"
 description = "YAML-first, pluggable runners & LLM-as-a-judge evaluation framework."
 readme = "README.md"
 requires-python = ">=3.10"
diff --git a/src/pondera/__init__.py b/src/pondera/__init__.py
@@ -22,4 +22,4 @@
     "ValidationError",
 ]
 
-__version__ = "0.6.0"
+__version__ = "0.6.1"
diff --git a/src/pondera/judge/base.py b/src/pondera/judge/base.py
@@ -19,12 +19,12 @@ def __init__(
         model: str | None = None,
         rubric: list[RubricCriterion] | None = None,
         system_append: str = "",
-        toolsets: tuple[Any, ...] = (),
+        tools: tuple[Any, ...] = (),
     ) -> None:
         self._default_rubric = rubric or default_rubric()
         self._system_append = system_append
         self._model = model
-        self._toolsets = toolsets
+        self._tools = tools
 
     async def judge(
         self,
@@ -44,7 +44,7 @@ async def judge(
         use_system = self._system_prompt(
             rb, self._system_append + ("\n" + system_append if system_append else "")
         )
-        agent = get_agent(system_prompt=use_system, output_type=Judgment, toolsets=self._toolsets)
+        agent = get_agent(system_prompt=use_system, output_type=Judgment, tools=self._tools)
 
         files_section = "\n".join(f"- {p}" for p in (files or [])) or "(none)"
 
diff --git a/src/pondera/judge/pydantic_ai.py b/src/pondera/judge/pydantic_ai.py
@@ -163,7 +163,6 @@ def get_agent(
     instructions: str | None = None,
     system_prompt: str | tuple[str, ...] = (),
     tools: tuple[Any, ...] = (),
-    toolsets: tuple[Any, ...] = (),
     model_settings: ModelSettings | None = None,
     output_type: Any = str,
     deps_type: type = NoneType,
@@ -184,7 +183,6 @@ def get_agent(
         deps_type=deps_type,
         model_settings=model_settings,
         tools=tools,
-        toolsets=toolsets,
         instrument=True,
     )
     return agent
diff --git a/tests/judge/test_base.py b/tests/judge/test_base.py
@@ -98,10 +98,10 @@ async def test_judge_user_prompt_format(
 @patch("pondera.judge.base.run_agent")
 @patch("pondera.judge.base.default_rubric")
 @pytest.mark.asyncio
-async def test_judge_calls_get_agent_with_toolsets(
+async def test_judge_calls_get_agent_with_tools(
     mock_default_rubric: Any, mock_run_agent: Any, mock_get_agent: Any
 ) -> None:
-    """Test that judge passes toolsets to get_agent."""
+    """Test that judge passes tools to get_agent."""
 
     def sample_tool(x: int) -> int:
         """Sample tool that doubles input."""
@@ -116,7 +116,7 @@ def sample_tool(x: int) -> int:
         score=90, evaluation_passed=True, reasoning="Excellent", criteria_scores={"accuracy": 90}
     )
     mock_run_agent.return_value = (expected_judgment, [])
-    judge = Judge(toolsets=(sample_tool,))
+    judge = Judge(tools=(sample_tool,))
 
     await judge.judge(
         question="What is 2+2?",
@@ -127,7 +127,7 @@ def sample_tool(x: int) -> int:
 
     mock_get_agent.assert_called_once()
     call_kwargs = mock_get_agent.call_args.kwargs
-    assert "toolsets" in call_kwargs
-    assert call_kwargs["toolsets"] == (sample_tool,)
+    assert "tools" in call_kwargs
+    assert call_kwargs["tools"] == (sample_tool,)
     assert call_kwargs["output_type"] == Judgment
     mock_run_agent.assert_called_once()
diff --git a/uv.lock b/uv.lock

Original file line number	Diff line number	Diff line change
`@@ -22,4 +22,4 @@`
`22`	`22`	`"ValidationError",`
`23`	`23`	`]`
`24`	`24`
`25`		`-__version__ = "0.6.0"`
	`25`	`+__version__ = "0.6.1"`