Commit d52f7a5

Change the output interface of evaluate (#8003)
* change the output interface of evaluate
* make the usage consistent
* clean up remaining codes
* fix mipro
* remove all_scores
* format comment
* rename outputs
* rename it to results
* pass empty results
* introduce EvaluationResult class
* address comments
* remove dspy.Prediction.__eq__ method
* lint
* fix test_eval_candidate_program_failure
* fix grpo
* fix grpo
1 parent 9bdfd3f commit d52f7a5

File tree

17 files changed: +97 -139 lines changed


docs/docs/tutorials/agents/index.ipynb

Lines changed: 4 additions & 7 deletions
@@ -500,23 +500,20 @@
 " metric=top5_recall,\n",
 " num_threads=16,\n",
 " display_progress=True,\n",
-" # To record the outputs and detailed scores to MLflow\n",
-" return_all_scores=True,\n",
-" return_outputs=True,\n",
 " )\n",
 "\n",
 " # Evaluate the program as usual\n",
-" aggregated_score, outputs, all_scores = evaluate(cot)\n",
+" result = evaluate(cot)\n",
 "\n",
 " # Log the aggregated score\n",
-" mlflow.log_metric(\"top5_recall\", aggregated_score)\n",
+" mlflow.log_metric(\"top5_recall\", result.score)\n",
 " # Log the detailed evaluation results as a table\n",
 " mlflow.log_table(\n",
 " {\n",
 " \"Claim\": [example.claim for example in eval_set],\n",
 " \"Expected Titles\": [example.titles for example in eval_set],\n",
-" \"Predicted Titles\": outputs,\n",
-" \"Top 5 Recall\": all_scores,\n",
+" \"Predicted Titles\": [output[1] for output in result.results],\n",
+" \"Top 5 Recall\": [output[2] for output in result.results],\n",
 " },\n",
 " artifact_file=\"eval_results.json\",\n",
 " )\n",

docs/docs/tutorials/classification_finetuning/index.ipynb

Lines changed: 4 additions & 7 deletions
@@ -568,23 +568,20 @@
 " metric=extraction_correctness_metric,\n",
 " num_threads=16,\n",
 " display_progress=True,\n",
-" # To record the outputs and detailed scores to MLflow\n",
-" return_all_scores=True,\n",
-" return_outputs=True,\n",
 " )\n",
 "\n",
 " # Evaluate the program as usual\n",
-" aggregated_score, outputs, all_scores = evaluate_correctness(people_extractor)\n",
+" result = evaluate_correctness(people_extractor)\n",
 "\n",
 " # Log the aggregated score\n",
-" mlflow.log_metric(\"exact_match\", aggregated_score)\n",
+" mlflow.log_metric(\"exact_match\", result.score)\n",
 " # Log the detailed evaluation results as a table\n",
 " mlflow.log_table(\n",
 " {\n",
 " \"Text\": [example.text for example in devset],\n",
 " \"Expected\": [example.example_label for example in devset],\n",
-" \"Predicted\": outputs,\n",
-" \"Exact match\": all_scores,\n",
+" \"Predicted\": [output[1] for output in result.results],\n",
+" \"Exact match\": [output[2] for output in result.results],\n",
 " },\n",
 " artifact_file=\"eval_results.json\",\n",
 " )\n",

docs/docs/tutorials/entity_extraction/index.ipynb

Lines changed: 4 additions & 7 deletions
@@ -514,23 +514,20 @@
 " metric=extraction_correctness_metric,\n",
 " num_threads=24,\n",
 " display_progress=True,\n",
-" # To record the outputs and detailed scores to MLflow\n",
-" return_all_scores=True,\n",
-" return_outputs=True,\n",
 " )\n",
 "\n",
 " # Evaluate the program as usual\n",
-" aggregated_score, outputs, all_scores = evaluate_correctness(people_extractor)\n",
+" result = evaluate_correctness(people_extractor)\n",
 "\n",
 " # Log the aggregated score\n",
-" mlflow.log_metric(\"exact_match\", aggregated_score)\n",
+" mlflow.log_metric(\"exact_match\", result.score)\n",
 " # Log the detailed evaluation results as a table\n",
 " mlflow.log_table(\n",
 " {\n",
 " \"Tokens\": [example.tokens for example in test_set],\n",
 " \"Expected\": [example.expected_extracted_people for example in test_set],\n",
-" \"Predicted\": outputs,\n",
-" \"Exact match\": all_scores,\n",
+" \"Predicted\": [output[1] for output in result.results],\n",
+" \"Exact match\": [output[2] for output in result.results],\n",
 " },\n",
 " artifact_file=\"eval_results.json\",\n",
 " )\n",

docs/docs/tutorials/math/index.ipynb

Lines changed: 5 additions & 5 deletions
@@ -369,21 +369,21 @@
 "\n",
 "# Start an MLflow Run to record the evaluation\n",
 "with mlflow.start_run(run_name=\"math_evaluation\"):\n",
-" kwargs = dict(num_threads=THREADS, display_progress=True, return_all_scores=True, return_outputs=True)\n",
+" kwargs = dict(num_threads=THREADS, display_progress=True)\n",
 " evaluate = dspy.Evaluate(devset=dataset.dev, metric=dataset.metric, **kwargs)\n",
 "\n",
 " # Evaluate the program as usual\n",
-" aggregated_score, outputs, all_scores = evaluate(module)\n",
+" result = evaluate(module)\n",
 "\n",
 " # Log the aggregated score\n",
-" mlflow.log_metric(\"correctness\", aggregated_score)\n",
+" mlflow.log_metric(\"correctness\", result.score)\n",
 " # Log the detailed evaluation results as a table\n",
 " mlflow.log_table(\n",
 " {\n",
 " \"Question\": [example.question for example in dataset.dev],\n",
 " \"Gold Answer\": [example.answer for example in dataset.dev],\n",
-" \"Predicted Answer\": outputs,\n",
-" \"Correctness\": all_scores,\n",
+" \"Predicted Answer\": [output[1] for output in result.results],\n",
+" \"Correctness\": [output[2] for output in result.results],\n",
 " },\n",
 " artifact_file=\"eval_results.json\",\n",
 " )\n",

docs/docs/tutorials/multihop_search/index.ipynb

Lines changed: 4 additions & 7 deletions
@@ -534,23 +534,20 @@
 " metric=top5_recall,\n",
 " num_threads=16,\n",
 " display_progress=True,\n",
-" # To record the outputs and detailed scores to MLflow\n",
-" return_all_scores=True,\n",
-" return_outputs=True,\n",
 " )\n",
 "\n",
 " # Evaluate the program as usual\n",
-" aggregated_score, outputs, all_scores = evaluate(Hop())\n",
+" result = evaluate(Hop())\n",
 "\n",
 " # Log the aggregated score\n",
-" mlflow.log_metric(\"top5_recall\", aggregated_score)\n",
+" mlflow.log_metric(\"top5_recall\", result.score)\n",
 " # Log the detailed evaluation results as a table\n",
 " mlflow.log_table(\n",
 " {\n",
 " \"Claim\": [example.claim for example in eval_set],\n",
 " \"Expected Titles\": [example.titles for example in eval_set],\n",
-" \"Predicted Titles\": outputs,\n",
-" \"Top 5 Recall\": all_scores,\n",
+" \"Predicted Titles\": [output[1] for output in result.results],\n",
+" \"Top 5 Recall\": [output[2] for output in result.results],\n",
 " },\n",
 " artifact_file=\"eval_results.json\",\n",
 " )\n",

docs/docs/tutorials/rag/index.ipynb

Lines changed: 9 additions & 7 deletions
@@ -731,24 +731,21 @@
 " metric=metric,\n",
 " num_threads=24,\n",
 " display_progress=True,\n",
-" # To record the outputs and detailed scores to MLflow\n",
-" return_all_scores=True,\n",
-" return_outputs=True,\n",
 " )\n",
 "\n",
 " # Evaluate the program as usual\n",
-" aggregated_score, outputs, all_scores = evaluate(cot)\n",
+" result = evaluate(cot)\n",
 "\n",
 "\n",
 " # Log the aggregated score\n",
-" mlflow.log_metric(\"semantic_f1_score\", aggregated_score)\n",
+" mlflow.log_metric(\"semantic_f1_score\", result.score)\n",
 " # Log the detailed evaluation results as a table\n",
 " mlflow.log_table(\n",
 " {\n",
 " \"Question\": [example.question for example in eval_set],\n",
 " \"Gold Response\": [example.response for example in eval_set],\n",
-" \"Predicted Response\": outputs,\n",
-" \"Semantic F1 Score\": all_scores,\n",
+" \"Predicted Response\": [output[1] for output in result.results],\n",
+" \"Semantic F1 Score\": [output[2] for output in result.results],\n",
 " },\n",
 " artifact_file=\"eval_results.json\",\n",
 " )\n",
@@ -1471,6 +1468,11 @@
 "\n",
 "The first step is to look at your system outputs, which will allow you to identify the sources of lower performance if any. While doing all of this, make sure you continue to refine your metric, e.g. by optimizing against your judgments, and to collect more (or more realistic) data, e.g. from related domains or from putting a demo of your system in front of users."
 ]
+},
+{
+"cell_type": "markdown",
+"metadata": {},
+"source": []
 }
 ],
 "metadata": {

dspy/evaluate/evaluate.py

Lines changed: 26 additions & 37 deletions
@@ -9,6 +9,7 @@
 import tqdm
 
 import dspy
+from dspy.primitives.prediction import Prediction
 from dspy.utils.callback import with_callbacks
 from dspy.utils.parallelizer import ParallelExecutor
 
@@ -42,6 +43,21 @@ def HTML(x: str) -> str: # noqa: N802
 logger = logging.getLogger(__name__)
 
 
+class EvaluationResult(Prediction):
+    """
+    A class that represents the result of an evaluation.
+    It is a subclass of `dspy.Prediction` that contains the following fields
+
+    - score: An float value (e.g., 67.30) representing the overall performance
+    - results: a list of (example, prediction, score) tuples for each example in devset
+    """
+    def __init__(self, score: float, results: list[Tuple["dspy.Example", "dspy.Example", Any]]):
+        super().__init__(score=score, results=results)
+
+    def __repr__(self):
+        return f"EvaluationResult(score={self.score}, results=<list of {len(self.results)} results>)"
+
+
 class Evaluate:
     """DSPy Evaluate class.
 
@@ -58,8 +74,6 @@ def __init__(
         display_progress: bool = False,
         display_table: Union[bool, int] = False,
         max_errors: Optional[int] = None,
-        return_all_scores: bool = False,
-        return_outputs: bool = False,
         provide_traceback: Optional[bool] = None,
         failure_score: float = 0.0,
         **kwargs,
@@ -74,8 +88,6 @@ def __init__(
                 If a number is passed, the evaluation results will be truncated to that number before displayed.
             max_errors (Optional[int]): The maximum number of errors to allow before
                 stopping evaluation. If ``None``, inherits from ``dspy.settings.max_errors``.
-            return_all_scores (bool): Whether to return scores for every data record in `devset`.
-            return_outputs (bool): Whether to return the dspy program's outputs for every data in `devset`.
             provide_traceback (Optional[bool]): Whether to provide traceback information during evaluation.
             failure_score (float): The default score to use if evaluation fails due to an exception.
         """
@@ -85,8 +97,6 @@ def __init__(
         self.display_progress = display_progress
         self.display_table = display_table
         self.max_errors = max_errors
-        self.return_all_scores = return_all_scores
-        self.return_outputs = return_outputs
         self.provide_traceback = provide_traceback
         self.failure_score = failure_score
 
@@ -99,10 +109,8 @@ def __call__(
         num_threads: Optional[int] = None,
        display_progress: Optional[bool] = None,
         display_table: Optional[Union[bool, int]] = None,
-        return_all_scores: Optional[bool] = None,
-        return_outputs: Optional[bool] = None,
         callback_metadata: Optional[dict[str, Any]] = None,
-    ):
+    ) -> EvaluationResult:
         """
         Args:
             program (dspy.Module): The DSPy program to evaluate.
@@ -114,36 +122,20 @@ def __call__(
                 `self.display_progress`.
             display_table (Union[bool, int]): Whether to display the evaluation results in a table. if not provided, use
                 `self.display_table`. If a number is passed, the evaluation results will be truncated to that number before displayed.
-            return_all_scores (bool): Whether to return scores for every data record in `devset`. if not provided,
-                use `self.return_all_scores`.
-            return_outputs (bool): Whether to return the dspy program's outputs for every data in `devset`. if not
-                provided, use `self.return_outputs`.
             callback_metadata (dict): Metadata to be used for evaluate callback handlers.
 
         Returns:
-            The evaluation results are returned in different formats based on the flags:
-
-            - Base return: A float percentage score (e.g., 67.30) representing overall performance
-
-            - With `return_all_scores=True`:
-                Returns (overall_score, individual_scores) where individual_scores is a list of
-                float scores for each example in devset
-
-            - With `return_outputs=True`:
-                Returns (overall_score, result_triples) where result_triples is a list of
-                (example, prediction, score) tuples for each example in devset
-
-            - With both flags=True:
-                Returns (overall_score, result_triples, individual_scores)
-
+            The evaluation results are returned as a dspy.EvaluationResult object containing the following attributes:
+
+            - score: A float percentage score (e.g., 67.30) representing overall performance
+
+            - results: a list of (example, prediction, score) tuples for each example in devset
         """
         metric = metric if metric is not None else self.metric
         devset = devset if devset is not None else self.devset
         num_threads = num_threads if num_threads is not None else self.num_threads
         display_progress = display_progress if display_progress is not None else self.display_progress
         display_table = display_table if display_table is not None else self.display_table
-        return_all_scores = return_all_scores if return_all_scores is not None else self.return_all_scores
-        return_outputs = return_outputs if return_outputs is not None else self.return_outputs
 
         if callback_metadata:
             logger.debug(f"Evaluate is called with callback metadata: {callback_metadata}")
@@ -194,14 +186,11 @@ def process_item(example):
         else:
             logger.warning("Skipping table display since `pandas` is not installed.")
 
-        if return_all_scores and return_outputs:
-            return round(100 * ncorrect / ntotal, 2), results, [score for *_, score in results]
-        if return_all_scores:
-            return round(100 * ncorrect / ntotal, 2), [score for *_, score in results]
-        if return_outputs:
-            return round(100 * ncorrect / ntotal, 2), results
+        return EvaluationResult(
+            score=round(100 * ncorrect / ntotal, 2),
+            results=results,
+        )
 
-        return round(100 * ncorrect / ntotal, 2)
 
     def _construct_result_table(
         self, results: list[Tuple["dspy.Example", "dspy.Example", Any]], metric_name: str
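
The core of the change: `Evaluate.__call__` now always returns a single `EvaluationResult` (a `dspy.Prediction` subclass) rather than a float or a flag-dependent tuple. A minimal, self-contained sketch of the new calling convention; the devset, metric, and program below are illustrative, and an LM must already be configured (e.g. `dspy.configure(lm=dspy.LM("openai/gpt-4o-mini"))`):

import dspy

devset = [
    dspy.Example(question="What is 2 + 2?", answer="4").with_inputs("question"),
    dspy.Example(question="What is 3 + 5?", answer="8").with_inputs("question"),
]

def exact_match(example, prediction, trace=None):
    # Standard dspy metric signature: (example, prediction, trace) -> score
    return example.answer == prediction.answer

evaluate = dspy.Evaluate(devset=devset, metric=exact_match, display_progress=False)
result = evaluate(dspy.Predict("question -> answer"))

# Old interface: a float, or a tuple shaped by return_all_scores/return_outputs.
# New interface: read attributes off the EvaluationResult.
print(result.score)  # aggregate percentage, e.g. 50.0
for example, prediction, score in result.results:
    print(example.question, prediction.answer, score)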

dspy/teleprompt/bootstrap_finetune.py

Lines changed: 2 additions & 3 deletions
@@ -231,7 +231,6 @@ def bootstrap_trace_data(
         devset=dataset,
         num_threads=num_threads,
         display_progress=True,
-        return_outputs=True,
         provide_traceback=False,  # TODO(check with team)
         max_errors=len(dataset) * 10,  # TODO(check with team)
         failure_score=failure_score,
@@ -290,10 +289,10 @@ def wrapped_program(**kwargs):
 
         return failed_pred, trace
 
-    _, outputs = evaluator(wrapped_program, metric=wrapped_metric)
+    results = evaluator(wrapped_program, metric=wrapped_metric).results
 
     data = []
-    for example_ind, (example, prediction, score) in enumerate(outputs):
+    for example_ind, (example, prediction, score) in enumerate(results):
         try:
             prediction, trace = prediction
         except ValueError as ve:
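
In `bootstrap_trace_data`, the evaluator wraps a program that returns `(prediction, trace)` pairs, which is why the loop re-unpacks the prediction slot. A simplified sketch of that consumption pattern; the dict built here is illustrative, not the repository's exact helper code:

results = evaluator(wrapped_program, metric=wrapped_metric).results

data = []
for example_ind, (example, prediction, score) in enumerate(results):
    # wrapped_program returns (prediction, trace) pairs, so unpack them here
    prediction, trace = prediction
    data.append(
        {
            "example_ind": example_ind,
            "example": example,
            "prediction": prediction,
            "trace": trace,
            "score": score,
        }
    )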

dspy/teleprompt/copro_optimizer.py

Lines changed: 1 addition & 1 deletion
@@ -225,7 +225,7 @@ def compile(self, student, *, trainset, eval_kwargs):
     f"At Depth {d+1}/{self.depth}, Evaluating Prompt Candidate #{c_i+1}/{len(candidates_)} for "
     f"Predictor {p_i+1} of {len(module.predictors())}.",
 )
-score = evaluate(module_clone, devset=trainset, **eval_kwargs)
+score = evaluate(module_clone, devset=trainset, **eval_kwargs).score
 if self.prompt_model:
     logger.debug(f"prompt_model.inspect_history(n=1) {self.prompt_model.inspect_history(n=1)}")
 total_calls += 1
