Commit d52f7a5

Change the output interface of evaluate (#8003)
* change the output interface of evaluate
* make the usage consistent
* clean up remaining codes
* fix mipro
* remove all_scores
* format comment
* rename outputs
* rename it to results
* pass empty results
* introduce EvaluationResult class
* address comments
* remove dspy.Prediction.__eq__ method
* lint
* fix test_eval_candidate_program_failure
* fix grpo
* fix grpo
1 parent 9bdfd3f commit d52f7a5

File tree

17 files changed: +97 -139 lines changed


docs/docs/tutorials/agents/index.ipynb

Lines changed: 4 additions & 7 deletions
@@ -500,23 +500,20 @@
 " metric=top5_recall,\n",
 " num_threads=16,\n",
 " display_progress=True,\n",
-" # To record the outputs and detailed scores to MLflow\n",
-" return_all_scores=True,\n",
-" return_outputs=True,\n",
 " )\n",
 "\n",
 " # Evaluate the program as usual\n",
-" aggregated_score, outputs, all_scores = evaluate(cot)\n",
+" result = evaluate(cot)\n",
 "\n",
 " # Log the aggregated score\n",
-" mlflow.log_metric(\"top5_recall\", aggregated_score)\n",
+" mlflow.log_metric(\"top5_recall\", result.score)\n",
 " # Log the detailed evaluation results as a table\n",
 " mlflow.log_table(\n",
 " {\n",
 " \"Claim\": [example.claim for example in eval_set],\n",
 " \"Expected Titles\": [example.titles for example in eval_set],\n",
-" \"Predicted Titles\": outputs,\n",
-" \"Top 5 Recall\": all_scores,\n",
+" \"Predicted Titles\": [output[1] for output in result.results],\n",
+" \"Top 5 Recall\": [output[2] for output in result.results],\n",
 " },\n",
 " artifact_file=\"eval_results.json\",\n",
 " )\n",

docs/docs/tutorials/classification_finetuning/index.ipynb

Lines changed: 4 additions & 7 deletions
@@ -568,23 +568,20 @@
 " metric=extraction_correctness_metric,\n",
 " num_threads=16,\n",
 " display_progress=True,\n",
-" # To record the outputs and detailed scores to MLflow\n",
-" return_all_scores=True,\n",
-" return_outputs=True,\n",
 " )\n",
 "\n",
 " # Evaluate the program as usual\n",
-" aggregated_score, outputs, all_scores = evaluate_correctness(people_extractor)\n",
+" result = evaluate_correctness(people_extractor)\n",
 "\n",
 " # Log the aggregated score\n",
-" mlflow.log_metric(\"exact_match\", aggregated_score)\n",
+" mlflow.log_metric(\"exact_match\", result.score)\n",
 " # Log the detailed evaluation results as a table\n",
 " mlflow.log_table(\n",
 " {\n",
 " \"Text\": [example.text for example in devset],\n",
 " \"Expected\": [example.example_label for example in devset],\n",
-" \"Predicted\": outputs,\n",
-" \"Exact match\": all_scores,\n",
+" \"Predicted\": [output[1] for output in result.results],\n",
+" \"Exact match\": [output[2] for output in result.results],\n",
 " },\n",
 " artifact_file=\"eval_results.json\",\n",
 " )\n",

docs/docs/tutorials/entity_extraction/index.ipynb

Lines changed: 4 additions & 7 deletions
@@ -514,23 +514,20 @@
 " metric=extraction_correctness_metric,\n",
 " num_threads=24,\n",
 " display_progress=True,\n",
-" # To record the outputs and detailed scores to MLflow\n",
-" return_all_scores=True,\n",
-" return_outputs=True,\n",
 " )\n",
 "\n",
 " # Evaluate the program as usual\n",
-" aggregated_score, outputs, all_scores = evaluate_correctness(people_extractor)\n",
+" result = evaluate_correctness(people_extractor)\n",
 "\n",
 " # Log the aggregated score\n",
-" mlflow.log_metric(\"exact_match\", aggregated_score)\n",
+" mlflow.log_metric(\"exact_match\", result.score)\n",
 " # Log the detailed evaluation results as a table\n",
 " mlflow.log_table(\n",
 " {\n",
 " \"Tokens\": [example.tokens for example in test_set],\n",
 " \"Expected\": [example.expected_extracted_people for example in test_set],\n",
-" \"Predicted\": outputs,\n",
-" \"Exact match\": all_scores,\n",
+" \"Predicted\": [output[1] for output in result.results],\n",
+" \"Exact match\": [output[2] for output in result.results],\n",
 " },\n",
 " artifact_file=\"eval_results.json\",\n",
 " )\n",

docs/docs/tutorials/math/index.ipynb

Lines changed: 5 additions & 5 deletions
@@ -369,21 +369,21 @@
 "\n",
 "# Start an MLflow Run to record the evaluation\n",
 "with mlflow.start_run(run_name=\"math_evaluation\"):\n",
-" kwargs = dict(num_threads=THREADS, display_progress=True, return_all_scores=True, return_outputs=True)\n",
+" kwargs = dict(num_threads=THREADS, display_progress=True)\n",
 " evaluate = dspy.Evaluate(devset=dataset.dev, metric=dataset.metric, **kwargs)\n",
 "\n",
 " # Evaluate the program as usual\n",
-" aggregated_score, outputs, all_scores = evaluate(module)\n",
+" result = evaluate(module)\n",
 "\n",
 " # Log the aggregated score\n",
-" mlflow.log_metric(\"correctness\", aggregated_score)\n",
+" mlflow.log_metric(\"correctness\", result.score)\n",
 " # Log the detailed evaluation results as a table\n",
 " mlflow.log_table(\n",
 " {\n",
 " \"Question\": [example.question for example in dataset.dev],\n",
 " \"Gold Answer\": [example.answer for example in dataset.dev],\n",
-" \"Predicted Answer\": outputs,\n",
-" \"Correctness\": all_scores,\n",
+" \"Predicted Answer\": [output[1] for output in result.results],\n",
+" \"Correctness\": [output[2] for output in result.results],\n",
 " },\n",
 " artifact_file=\"eval_results.json\",\n",
 " )\n",

docs/docs/tutorials/multihop_search/index.ipynb

Lines changed: 4 additions & 7 deletions
@@ -534,23 +534,20 @@
 " metric=top5_recall,\n",
 " num_threads=16,\n",
 " display_progress=True,\n",
-" # To record the outputs and detailed scores to MLflow\n",
-" return_all_scores=True,\n",
-" return_outputs=True,\n",
 " )\n",
 "\n",
 " # Evaluate the program as usual\n",
-" aggregated_score, outputs, all_scores = evaluate(Hop())\n",
+" result = evaluate(Hop())\n",
 "\n",
 " # Log the aggregated score\n",
-" mlflow.log_metric(\"top5_recall\", aggregated_score)\n",
+" mlflow.log_metric(\"top5_recall\", result.score)\n",
 " # Log the detailed evaluation results as a table\n",
 " mlflow.log_table(\n",
 " {\n",
 " \"Claim\": [example.claim for example in eval_set],\n",
 " \"Expected Titles\": [example.titles for example in eval_set],\n",
-" \"Predicted Titles\": outputs,\n",
-" \"Top 5 Recall\": all_scores,\n",
+" \"Predicted Titles\": [output[1] for output in result.results],\n",
+" \"Top 5 Recall\": [output[2] for output in result.results],\n",
 " },\n",
 " artifact_file=\"eval_results.json\",\n",
 " )\n",

docs/docs/tutorials/rag/index.ipynb

Lines changed: 9 additions & 7 deletions
@@ -731,24 +731,21 @@
 " metric=metric,\n",
 " num_threads=24,\n",
 " display_progress=True,\n",
-" # To record the outputs and detailed scores to MLflow\n",
-" return_all_scores=True,\n",
-" return_outputs=True,\n",
 " )\n",
 "\n",
 " # Evaluate the program as usual\n",
-" aggregated_score, outputs, all_scores = evaluate(cot)\n",
+" result = evaluate(cot)\n",
 "\n",
 "\n",
 " # Log the aggregated score\n",
-" mlflow.log_metric(\"semantic_f1_score\", aggregated_score)\n",
+" mlflow.log_metric(\"semantic_f1_score\", result.score)\n",
 " # Log the detailed evaluation results as a table\n",
 " mlflow.log_table(\n",
 " {\n",
 " \"Question\": [example.question for example in eval_set],\n",
 " \"Gold Response\": [example.response for example in eval_set],\n",
-" \"Predicted Response\": outputs,\n",
-" \"Semantic F1 Score\": all_scores,\n",
+" \"Predicted Response\": [output[1] for output in result.results],\n",
+" \"Semantic F1 Score\": [output[2] for output in result.results],\n",
 " },\n",
 " artifact_file=\"eval_results.json\",\n",
 " )\n",
@@ -1471,6 +1468,11 @@
 "\n",
 "The first step is to look at your system outputs, which will allow you to identify the sources of lower performance if any. While doing all of this, make sure you continue to refine your metric, e.g. by optimizing against your judgments, and to collect more (or more realistic) data, e.g. from related domains or from putting a demo of your system in front of users."
 ]
+},
+{
+"cell_type": "markdown",
+"metadata": {},
+"source": []
 }
 ],
 "metadata": {

dspy/evaluate/evaluate.py

Lines changed: 26 additions & 37 deletions
@@ -9,6 +9,7 @@
 import tqdm
 
 import dspy
+from dspy.primitives.prediction import Prediction
 from dspy.utils.callback import with_callbacks
 from dspy.utils.parallelizer import ParallelExecutor
 
@@ -42,6 +43,21 @@ def HTML(x: str) -> str: # noqa: N802
 logger = logging.getLogger(__name__)
 
 
+class EvaluationResult(Prediction):
+    """
+    A class that represents the result of an evaluation.
+    It is a subclass of `dspy.Prediction` that contains the following fields
+
+    - score: An float value (e.g., 67.30) representing the overall performance
+    - results: a list of (example, prediction, score) tuples for each example in devset
+    """
+    def __init__(self, score: float, results: list[Tuple["dspy.Example", "dspy.Example", Any]]):
+        super().__init__(score=score, results=results)
+
+    def __repr__(self):
+        return f"EvaluationResult(score={self.score}, results=<list of {len(self.results)} results>)"
+
+
 class Evaluate:
     """DSPy Evaluate class.
 
@@ -58,8 +74,6 @@ def __init__(
         display_progress: bool = False,
         display_table: Union[bool, int] = False,
         max_errors: Optional[int] = None,
-        return_all_scores: bool = False,
-        return_outputs: bool = False,
         provide_traceback: Optional[bool] = None,
         failure_score: float = 0.0,
         **kwargs,
@@ -74,8 +88,6 @@ def __init__(
                 If a number is passed, the evaluation results will be truncated to that number before displayed.
             max_errors (Optional[int]): The maximum number of errors to allow before
                 stopping evaluation. If ``None``, inherits from ``dspy.settings.max_errors``.
-            return_all_scores (bool): Whether to return scores for every data record in `devset`.
-            return_outputs (bool): Whether to return the dspy program's outputs for every data in `devset`.
             provide_traceback (Optional[bool]): Whether to provide traceback information during evaluation.
             failure_score (float): The default score to use if evaluation fails due to an exception.
         """
@@ -85,8 +97,6 @@ def __init__(
         self.display_progress = display_progress
         self.display_table = display_table
         self.max_errors = max_errors
-        self.return_all_scores = return_all_scores
-        self.return_outputs = return_outputs
         self.provide_traceback = provide_traceback
         self.failure_score = failure_score
 
@@ -99,10 +109,8 @@ def __call__(
         num_threads: Optional[int] = None,
        display_progress: Optional[bool] = None,
         display_table: Optional[Union[bool, int]] = None,
-        return_all_scores: Optional[bool] = None,
-        return_outputs: Optional[bool] = None,
         callback_metadata: Optional[dict[str, Any]] = None,
-    ):
+    ) -> EvaluationResult:
         """
         Args:
             program (dspy.Module): The DSPy program to evaluate.
@@ -114,36 +122,20 @@ def __call__(
                 `self.display_progress`.
             display_table (Union[bool, int]): Whether to display the evaluation results in a table. if not provided, use
                 `self.display_table`. If a number is passed, the evaluation results will be truncated to that number before displayed.
-            return_all_scores (bool): Whether to return scores for every data record in `devset`. if not provided,
-                use `self.return_all_scores`.
-            return_outputs (bool): Whether to return the dspy program's outputs for every data in `devset`. if not
-                provided, use `self.return_outputs`.
             callback_metadata (dict): Metadata to be used for evaluate callback handlers.
 
         Returns:
-            The evaluation results are returned in different formats based on the flags:
-
-            - Base return: A float percentage score (e.g., 67.30) representing overall performance
-
-            - With `return_all_scores=True`:
-                Returns (overall_score, individual_scores) where individual_scores is a list of
-                float scores for each example in devset
-
-            - With `return_outputs=True`:
-                Returns (overall_score, result_triples) where result_triples is a list of
-                (example, prediction, score) tuples for each example in devset
-
-            - With both flags=True:
-                Returns (overall_score, result_triples, individual_scores)
-
+            The evaluation results are returned as a dspy.EvaluationResult object containing the following attributes:
+
+            - score: A float percentage score (e.g., 67.30) representing overall performance
+
+            - results: a list of (example, prediction, score) tuples for each example in devset
         """
         metric = metric if metric is not None else self.metric
         devset = devset if devset is not None else self.devset
         num_threads = num_threads if num_threads is not None else self.num_threads
         display_progress = display_progress if display_progress is not None else self.display_progress
         display_table = display_table if display_table is not None else self.display_table
-        return_all_scores = return_all_scores if return_all_scores is not None else self.return_all_scores
-        return_outputs = return_outputs if return_outputs is not None else self.return_outputs
 
         if callback_metadata:
             logger.debug(f"Evaluate is called with callback metadata: {callback_metadata}")
@@ -194,14 +186,11 @@ def process_item(example):
         else:
             logger.warning("Skipping table display since `pandas` is not installed.")
 
-        if return_all_scores and return_outputs:
-            return round(100 * ncorrect / ntotal, 2), results, [score for *_, score in results]
-        if return_all_scores:
-            return round(100 * ncorrect / ntotal, 2), [score for *_, score in results]
-        if return_outputs:
-            return round(100 * ncorrect / ntotal, 2), results
+        return EvaluationResult(
+            score=round(100 * ncorrect / ntotal, 2),
+            results=results,
+        )
 
-        return round(100 * ncorrect / ntotal, 2)
 
     def _construct_result_table(
         self, results: list[Tuple["dspy.Example", "dspy.Example", Any]], metric_name: str
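
The core of the change: `Evaluate.__call__` now always returns a single `EvaluationResult` (a `dspy.Prediction` subclass) rather than a float or a flag-dependent tuple. A minimal, self-contained sketch of the new calling convention; the devset, metric, and program below are illustrative, and an LM must already be configured (e.g. `dspy.configure(lm=dspy.LM("openai/gpt-4o-mini"))`):

import dspy

devset = [
    dspy.Example(question="What is 2 + 2?", answer="4").with_inputs("question"),
    dspy.Example(question="What is 3 + 5?", answer="8").with_inputs("question"),
]

def exact_match(example, prediction, trace=None):
    # Standard dspy metric signature: (example, prediction, trace) -> score
    return example.answer == prediction.answer

evaluate = dspy.Evaluate(devset=devset, metric=exact_match, display_progress=False)
result = evaluate(dspy.Predict("question -> answer"))

# Old interface: a float, or a tuple shaped by return_all_scores/return_outputs.
# New interface: read attributes off the EvaluationResult.
print(result.score)  # aggregate percentage, e.g. 50.0
for example, prediction, score in result.results:
    print(example.question, prediction.answer, score)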

dspy/teleprompt/bootstrap_finetune.py

Lines changed: 2 additions & 3 deletions
@@ -231,7 +231,6 @@ def bootstrap_trace_data(
         devset=dataset,
         num_threads=num_threads,
         display_progress=True,
-        return_outputs=True,
         provide_traceback=False,  # TODO(check with team)
         max_errors=len(dataset) * 10,  # TODO(check with team)
         failure_score=failure_score,
@@ -290,10 +289,10 @@ def wrapped_program(**kwargs):
 
         return failed_pred, trace
 
-    _, outputs = evaluator(wrapped_program, metric=wrapped_metric)
+    results = evaluator(wrapped_program, metric=wrapped_metric).results
 
     data = []
-    for example_ind, (example, prediction, score) in enumerate(outputs):
+    for example_ind, (example, prediction, score) in enumerate(results):
         try:
             prediction, trace = prediction
         except ValueError as ve:
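
In `bootstrap_trace_data`, the evaluator wraps a program that returns `(prediction, trace)` pairs, which is why the loop re-unpacks the prediction slot. A simplified sketch of that consumption pattern; the dict built here is illustrative, not the repository's exact helper code:

results = evaluator(wrapped_program, metric=wrapped_metric).results

data = []
for example_ind, (example, prediction, score) in enumerate(results):
    # wrapped_program returns (prediction, trace) pairs, so unpack them here
    prediction, trace = prediction
    data.append(
        {
            "example_ind": example_ind,
            "example": example,
            "prediction": prediction,
            "trace": trace,
            "score": score,
        }
    )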

dspy/teleprompt/copro_optimizer.py

Lines changed: 1 addition & 1 deletion
@@ -225,7 +225,7 @@ def compile(self, student, *, trainset, eval_kwargs):
     f"At Depth {d+1}/{self.depth}, Evaluating Prompt Candidate #{c_i+1}/{len(candidates_)} for "
     f"Predictor {p_i+1} of {len(module.predictors())}.",
 )
-score = evaluate(module_clone, devset=trainset, **eval_kwargs)
+score = evaluate(module_clone, devset=trainset, **eval_kwargs).score
 if self.prompt_model:
     logger.debug(f"prompt_model.inspect_history(n=1) {self.prompt_model.inspect_history(n=1)}")
 total_calls += 1
