Commit 448c7b7

Allow definition of Model Settings for LLMJudge (#1662)

1 parent c0afc0d commit 448c7b7

File tree: 4 files changed, +140 −8 lines

pydantic_evals/pydantic_evals/evaluators/common.py
pydantic_evals/pydantic_evals/evaluators/llm_as_a_judge.py
tests/evals/test_evaluator_common.py
tests/evals/test_llm_as_a_judge.py

pydantic_evals/pydantic_evals/evaluators/common.py

Lines changed: 6 additions & 2 deletions

@@ -5,6 +5,7 @@
 from typing import Any, cast
 
 from pydantic_ai import models
+from pydantic_ai.settings import ModelSettings
 
 from ..otel.span_tree import SpanQuery
 from .context import EvaluatorContext
@@ -164,6 +165,7 @@ class LLMJudge(Evaluator[object, object, object]):
     rubric: str
     model: models.Model | models.KnownModelName | None = None
     include_input: bool = False
+    model_settings: ModelSettings | None = None
 
     async def evaluate(
         self,
@@ -172,11 +174,13 @@ async def evaluate(
         if self.include_input:
             from .llm_as_a_judge import judge_input_output
 
-            grading_output = await judge_input_output(ctx.inputs, ctx.output, self.rubric, self.model)
+            grading_output = await judge_input_output(
+                ctx.inputs, ctx.output, self.rubric, self.model, self.model_settings
+            )
         else:
             from .llm_as_a_judge import judge_output
 
-            grading_output = await judge_output(ctx.output, self.rubric, self.model)
+            grading_output = await judge_output(ctx.output, self.rubric, self.model, self.model_settings)
         return EvaluationReason(value=grading_output.pass_, reason=grading_output.reason)
 
     def build_serialization_arguments(self):
pydantic_evals/pydantic_evals/evaluators/llm_as_a_judge.py

Lines changed: 16 additions & 4 deletions

@@ -7,6 +7,7 @@
 from pydantic_core import to_json
 
 from pydantic_ai import Agent, models
+from pydantic_ai.settings import ModelSettings
 
 __all__ = ('GradingOutput', 'judge_input_output', 'judge_output', 'set_default_judge_model')
 
@@ -44,15 +45,20 @@ class GradingOutput(BaseModel, populate_by_name=True):
 
 
 async def judge_output(
-    output: Any, rubric: str, model: models.Model | models.KnownModelName | None = None
+    output: Any,
+    rubric: str,
+    model: models.Model | models.KnownModelName | None = None,
+    model_settings: ModelSettings | None = None,
 ) -> GradingOutput:
     """Judge the output of a model based on a rubric.
 
     If the model is not specified, a default model is used. The default model starts as 'openai:gpt-4o',
     but this can be changed using the `set_default_judge_model` function.
     """
     user_prompt = f'<Output>\n{_stringify(output)}\n</Output>\n<Rubric>\n{rubric}\n</Rubric>'
-    return (await _judge_output_agent.run(user_prompt, model=model or _default_model)).output
+    return (
+        await _judge_output_agent.run(user_prompt, model=model or _default_model, model_settings=model_settings)
+    ).output
 
 
 _judge_input_output_agent = Agent(
@@ -79,15 +85,21 @@ async def judge_output(
 
 
 async def judge_input_output(
-    inputs: Any, output: Any, rubric: str, model: models.Model | models.KnownModelName | None = None
+    inputs: Any,
+    output: Any,
+    rubric: str,
+    model: models.Model | models.KnownModelName | None = None,
+    model_settings: ModelSettings | None = None,
 ) -> GradingOutput:
     """Judge the output of a model based on the inputs and a rubric.
 
     If the model is not specified, a default model is used. The default model starts as 'openai:gpt-4o',
     but this can be changed using the `set_default_judge_model` function.
     """
     user_prompt = f'<Input>\n{_stringify(inputs)}\n</Input>\n<Output>\n{_stringify(output)}\n</Output>\n<Rubric>\n{rubric}\n</Rubric>'
-    return (await _judge_input_output_agent.run(user_prompt, model=model or _default_model)).output
+    return (
+        await _judge_input_output_agent.run(user_prompt, model=model or _default_model, model_settings=model_settings)
+    ).output
 
 
 def set_default_judge_model(model: models.Model | models.KnownModelName) -> None:  # pragma: no cover
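
The judge helpers also accept the new model_settings parameter when called directly. A sketch of such a call is shown below, assuming judge_output is imported from pydantic_evals.evaluators.llm_as_a_judge as in the tests; the rubric text and temperature are illustrative:

import asyncio

from pydantic_ai.settings import ModelSettings
from pydantic_evals.evaluators.llm_as_a_judge import judge_output


async def main() -> None:
    # Forward custom settings (e.g. temperature) to the underlying judge agent run.
    grading = await judge_output(
        'Hello world',
        'Content contains a greeting',
        model_settings=ModelSettings(temperature=0.0),
    )
    print(grading.pass_, grading.reason)


asyncio.run(main())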

tests/evals/test_evaluator_common.py

Lines changed: 59 additions & 2 deletions

@@ -7,6 +7,8 @@
 from inline_snapshot import snapshot
 from pytest_mock import MockerFixture
 
+from pydantic_ai.settings import ModelSettings
+
 from ..conftest import try_import
 
 with try_import() as imports_successful:
@@ -222,7 +224,7 @@ async def test_llm_judge_evaluator(mocker: MockerFixture):
     assert result.value is True
     assert result.reason == 'Test passed'
 
-    mock_judge_output.assert_called_once_with('Hello world', 'Content contains a greeting', None)
+    mock_judge_output.assert_called_once_with('Hello world', 'Content contains a greeting', None, None)
 
     # Test with input
     evaluator = LLMJudge(rubric='Output contains input', include_input=True, model='openai:gpt-4o')
@@ -232,7 +234,7 @@ async def test_llm_judge_evaluator(mocker: MockerFixture):
     assert result.reason == 'Test passed'
 
     mock_judge_input_output.assert_called_once_with(
-        {'prompt': 'Hello'}, 'Hello world', 'Output contains input', 'openai:gpt-4o'
+        {'prompt': 'Hello'}, 'Hello world', 'Output contains input', 'openai:gpt-4o', None
     )
 
     # Test with failing result
@@ -244,6 +246,61 @@ async def test_llm_judge_evaluator(mocker: MockerFixture):
     assert result.reason == 'Test failed'
 
 
+@pytest.mark.anyio
+async def test_llm_judge_evaluator_with_model_settings(mocker: MockerFixture):
+    """Test LLMJudge evaluator with specific model_settings."""
+    mock_grading_output = mocker.MagicMock()
+    mock_grading_output.pass_ = True
+    mock_grading_output.reason = 'Test passed with settings'
+
+    mock_judge_output = mocker.patch('pydantic_evals.evaluators.llm_as_a_judge.judge_output')
+    mock_judge_output.return_value = mock_grading_output
+
+    mock_judge_input_output = mocker.patch('pydantic_evals.evaluators.llm_as_a_judge.judge_input_output')
+    mock_judge_input_output.return_value = mock_grading_output
+
+    custom_model_settings = ModelSettings(temperature=0.77)
+
+    ctx = EvaluatorContext(
+        name='test_custom_settings',
+        inputs={'prompt': 'Hello Custom'},
+        metadata=None,
+        expected_output=None,
+        output='Hello world custom settings',
+        duration=0.0,
+        _span_tree=SpanTreeRecordingError('spans were not recorded'),
+        attributes={},
+        metrics={},
+    )
+
+    # Test without input, with custom model_settings
+    evaluator_no_input = LLMJudge(rubric='Greeting with custom settings', model_settings=custom_model_settings)
+    result_no_input = await evaluator_no_input.evaluate(ctx)
+    assert result_no_input.value is True
+    assert result_no_input.reason == 'Test passed with settings'
+    mock_judge_output.assert_called_once_with(
+        'Hello world custom settings', 'Greeting with custom settings', None, custom_model_settings
+    )
+
+    # Test with input, with custom model_settings
+    evaluator_with_input = LLMJudge(
+        rubric='Output contains input with custom settings',
+        include_input=True,
+        model='openai:gpt-3.5-turbo',
+        model_settings=custom_model_settings,
+    )
+    result_with_input = await evaluator_with_input.evaluate(ctx)
+    assert result_with_input.value is True
+    assert result_with_input.reason == 'Test passed with settings'
+    mock_judge_input_output.assert_called_once_with(
+        {'prompt': 'Hello Custom'},
+        'Hello world custom settings',
+        'Output contains input with custom settings',
+        'openai:gpt-3.5-turbo',
+        custom_model_settings,
+    )
+
+
 async def test_python():
     """Test Python evaluator."""
     evaluator = Python(expression='ctx.output > 0')

tests/evals/test_llm_as_a_judge.py

Lines changed: 59 additions & 0 deletions

@@ -6,6 +6,7 @@
 from ..conftest import try_import
 
 with try_import() as imports_successful:
+    from pydantic_ai.settings import ModelSettings
     from pydantic_evals.evaluators.llm_as_a_judge import (
         GradingOutput,
         _stringify,  # pyright: ignore[reportPrivateUsage]
@@ -87,6 +88,34 @@ async def test_judge_output_mock(mocker: MockerFixture):
     assert '<Rubric>\nContent contains a greeting\n</Rubric>' in call_args[0]
 
 
+@pytest.mark.anyio
+async def test_judge_output_with_model_settings_mock(mocker: MockerFixture):
+    """Test judge_output function with model_settings and mocked agent."""
+    mock_result = mocker.MagicMock()
+    mock_result.output = GradingOutput(reason='Test passed with settings', pass_=True, score=1.0)
+    mock_run = mocker.patch('pydantic_ai.Agent.run', return_value=mock_result)
+
+    test_model_settings = ModelSettings(temperature=1)
+
+    grading_output = await judge_output(
+        'Hello world settings',
+        'Content contains a greeting with settings',
+        model_settings=test_model_settings,
+    )
+    assert isinstance(grading_output, GradingOutput)
+    assert grading_output.reason == 'Test passed with settings'
+    assert grading_output.pass_ is True
+    assert grading_output.score == 1.0
+
+    mock_run.assert_called_once()
+    call_args, call_kwargs = mock_run.call_args
+    assert '<Output>\nHello world settings\n</Output>' in call_args[0]
+    assert '<Rubric>\nContent contains a greeting with settings\n</Rubric>' in call_args[0]
+    assert call_kwargs['model_settings'] == test_model_settings
+    # Check if 'model' kwarg is passed, its value will be the default model or None
+    assert 'model' in call_kwargs
+
+
 @pytest.mark.anyio
 async def test_judge_input_output_mock(mocker: MockerFixture):
     """Test judge_input_output function with mocked agent."""
@@ -108,3 +137,33 @@ async def test_judge_input_output_mock(mocker: MockerFixture):
     assert '<Input>\nHello\n</Input>' in call_args[0]
     assert '<Output>\nHello world\n</Output>' in call_args[0]
     assert '<Rubric>\nOutput contains input\n</Rubric>' in call_args[0]
+
+
+@pytest.mark.anyio
+async def test_judge_input_output_with_model_settings_mock(mocker: MockerFixture):
+    """Test judge_input_output function with model_settings and mocked agent."""
+    mock_result = mocker.MagicMock()
+    mock_result.output = GradingOutput(reason='Test passed with settings', pass_=True, score=1.0)
+    mock_run = mocker.patch('pydantic_ai.Agent.run', return_value=mock_result)
+
+    test_model_settings = ModelSettings(temperature=1)
+
+    result = await judge_input_output(
+        'Hello settings',
+        'Hello world with settings',
+        'Output contains input with settings',
+        model_settings=test_model_settings,
+    )
+    assert isinstance(result, GradingOutput)
+    assert result.reason == 'Test passed with settings'
+    assert result.pass_ is True
+    assert result.score == 1.0
+
+    mock_run.assert_called_once()
+    call_args, call_kwargs = mock_run.call_args
+    assert '<Input>\nHello settings\n</Input>' in call_args[0]
+    assert '<Output>\nHello world with settings\n</Output>' in call_args[0]
+    assert '<Rubric>\nOutput contains input with settings\n</Rubric>' in call_args[0]
+    assert call_kwargs['model_settings'] == test_model_settings
+    # Check if 'model' kwarg is passed, its value will be the default model or None
+    assert 'model' in call_kwargs
