Skip to content

Commit 00ffe0c

Browse files
Add progress bar on evaluate (#1871)
Co-authored-by: Marcelo Trylesinski <marcelotryle@gmail.com>
1 parent a341e56 commit 00ffe0c

File tree

1 file changed

+26
-7
lines changed

1 file changed

+26
-7
lines changed

pydantic_evals/pydantic_evals/dataset.py

Lines changed: 26 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
import time
1616
import warnings
1717
from collections.abc import Awaitable, Mapping, Sequence
18-
from contextlib import AsyncExitStack
18+
from contextlib import AsyncExitStack, nullcontext
1919
from contextvars import ContextVar
2020
from dataclasses import dataclass, field
2121
from pathlib import Path
@@ -28,6 +28,7 @@
2828
from pydantic._internal import _typing_extra
2929
from pydantic_core import to_json
3030
from pydantic_core.core_schema import SerializationInfo, SerializerFunctionWrapHandler
31+
from rich.progress import Progress
3132
from typing_extensions import NotRequired, Self, TypedDict, TypeVar
3233

3334
from pydantic_evals._utils import get_event_loop
@@ -251,7 +252,11 @@ def __init__(
251252
)
252253

253254
async def evaluate(
254-
self, task: Callable[[InputsT], Awaitable[OutputT]], name: str | None = None, max_concurrency: int | None = None
255+
self,
256+
task: Callable[[InputsT], Awaitable[OutputT]],
257+
name: str | None = None,
258+
max_concurrency: int | None = None,
259+
progress: bool = True,
255260
) -> EvaluationReport:
256261
"""Evaluates the test cases in the dataset using the given task.
257262
@@ -265,18 +270,26 @@ async def evaluate(
265270
If omitted, the name of the task function will be used.
266271
max_concurrency: The maximum number of concurrent evaluations of the task to allow.
267272
If None, all cases will be evaluated concurrently.
273+
progress: Whether to show a progress bar for the evaluation. Defaults to `True`.
268274
269275
Returns:
270276
A report containing the results of the evaluation.
271277
"""
272278
name = name or get_unwrapped_function_name(task)
279+
total_cases = len(self.cases)
280+
progress_bar = Progress() if progress else None
273281

274282
limiter = anyio.Semaphore(max_concurrency) if max_concurrency is not None else AsyncExitStack()
275-
with _logfire.span('evaluate {name}', name=name) as eval_span:
283+
284+
with _logfire.span('evaluate {name}', name=name) as eval_span, progress_bar or nullcontext():
285+
task_id = progress_bar.add_task(f'Evaluating {name}', total=total_cases) if progress_bar else None
276286

277287
async def _handle_case(case: Case[InputsT, OutputT, MetadataT], report_case_name: str):
278288
async with limiter:
279-
return await _run_task_and_evaluators(task, case, report_case_name, self.evaluators)
289+
result = await _run_task_and_evaluators(task, case, report_case_name, self.evaluators)
290+
if progress_bar and task_id is not None: # pragma: no branch
291+
progress_bar.update(task_id, advance=1)
292+
return result
280293

281294
report = EvaluationReport(
282295
name=name,
@@ -291,11 +304,14 @@ async def _handle_case(case: Case[InputsT, OutputT, MetadataT], report_case_name
291304
eval_span.set_attribute('cases', report.cases)
292305
# TODO(DavidM): Remove this 'averages' attribute once we compute it in the details panel
293306
eval_span.set_attribute('averages', report.averages())
294-
295307
return report
296308

297309
def evaluate_sync(
298-
self, task: Callable[[InputsT], Awaitable[OutputT]], name: str | None = None, max_concurrency: int | None = None
310+
self,
311+
task: Callable[[InputsT], Awaitable[OutputT]],
312+
name: str | None = None,
313+
max_concurrency: int | None = None,
314+
progress: bool = True,
299315
) -> EvaluationReport:
300316
"""Evaluates the test cases in the dataset using the given task.
301317
@@ -308,11 +324,14 @@ def evaluate_sync(
308324
If omitted, the name of the task function will be used.
309325
max_concurrency: The maximum number of concurrent evaluations of the task to allow.
310326
If None, all cases will be evaluated concurrently.
327+
progress: Whether to show a progress bar for the evaluation. Defaults to `True`.
311328
312329
Returns:
313330
A report containing the results of the evaluation.
314331
"""
315-
return get_event_loop().run_until_complete(self.evaluate(task, name=name, max_concurrency=max_concurrency))
332+
return get_event_loop().run_until_complete(
333+
self.evaluate(task, name=name, max_concurrency=max_concurrency, progress=progress)
334+
)
316335

317336
def add_case(
318337
self,

0 commit comments

Comments
 (0)