
Commit 634a210

Exception on target failure (#39350)
* Exception on target failure
* Fixing test by running in its own tmp directory
* Fixing some failing tests
* Fixing more tests
1 parent 93fd77c commit 634a210

File tree

6 files changed: +44 −13 lines changed


sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_batch_run/__init__.py

Lines changed: 2 additions & 1 deletion
@@ -5,5 +5,6 @@
 from .code_client import CodeClient
 from .proxy_client import ProxyClient
 from .target_run_context import TargetRunContext
+from .proxy_client import ProxyRun
 
-__all__ = ["CodeClient", "ProxyClient", "EvalRunContext", "TargetRunContext"]
+__all__ = ["CodeClient", "ProxyClient", "EvalRunContext", "TargetRunContext", "ProxyRun"]

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py

Lines changed: 19 additions & 8 deletions
@@ -27,7 +27,7 @@
 )
 from .._model_configurations import AzureAIProject, EvaluationResult, EvaluatorConfig
 from .._user_agent import USER_AGENT
-from ._batch_run import EvalRunContext, CodeClient, ProxyClient, TargetRunContext
+from ._batch_run import EvalRunContext, CodeClient, ProxyClient, TargetRunContext, ProxyRun
 from ._utils import (
     _apply_column_mapping,
     _log_metrics_and_instance_results,
@@ -448,7 +448,7 @@ def _validate_and_load_data(target, data, evaluators, output_path, azure_ai_proj
 def _apply_target_to_data(
     target: Callable,
     data: Union[str, os.PathLike],
-    pf_client: PFClient,
+    batch_client: TClient,
     initial_data: pd.DataFrame,
     evaluation_name: Optional[str] = None,
     **kwargs,
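
`TClient` is not defined in this hunk; judging from the calls made later in the function (`run`, `get_details`, `get_run_summary`), it plausibly names the interface shared by `ProxyClient` and `CodeClient`. A hypothetical protocol capturing what `_apply_target_to_data` actually requires (an illustration, not the real definition):

    # Assumed client interface; TClient's real definition is not in this diff.
    from typing import Any, Protocol
    import pandas as pd

    class BatchClient(Protocol):
        def run(self, flow: Any, display_name: str, data: Any, stream: bool, name: str) -> Any: ...
        def get_details(self, run: Any, all_results: bool = False) -> pd.DataFrame: ...
        def get_run_summary(self, run: Any) -> dict: ...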
@@ -460,8 +460,8 @@
     :type target: Callable
     :param data: The path to input jsonl or csv file.
     :type data: Union[str, os.PathLike]
-    :param pf_client: The promptflow client to be used.
-    :type pf_client: PFClient
+    :param batch_client: The promptflow client to be used.
+    :type batch_client: PFClient
     :param initial_data: The data frame with the loaded data.
     :type initial_data: pd.DataFrame
     :param evaluation_name: The name of the evaluation.
@@ -471,15 +471,26 @@
     """
     _run_name = kwargs.get("_run_name")
     with TargetRunContext():
-        run: Run = pf_client.run(
+        run: ProxyRun = batch_client.run(
             flow=target,
             display_name=evaluation_name,
             data=data,
             stream=True,
             name=_run_name,
         )
 
-    target_output: pd.DataFrame = pf_client.runs.get_details(run, all_results=True)
+    target_output: pd.DataFrame = batch_client.get_details(run, all_results=True)
+    run_summary = batch_client.get_run_summary(run)
+
+    if run_summary["completed_lines"] == 0:
+        msg = (f"Evaluation target failed to produce any results."
+               f" Please check the logs at {run_summary['log_path']} for more details about cause of failure.")
+        raise EvaluationException(
+            message=msg,
+            target=ErrorTarget.EVALUATE,
+            category=ErrorCategory.FAILED_EXECUTION,
+            blame=ErrorBlame.USER_ERROR,
+        )
     # Remove input and output prefix
     generated_columns = {
         col[len(Prefixes.OUTPUTS) :] for col in target_output.columns if col.startswith(Prefixes.OUTPUTS)
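
The new guard turns a silent, total target failure into an explicit error: if no input line completed, evaluation cannot proceed, so the user is pointed at the run logs instead of hitting a confusing downstream failure. A hypothetical summary shape that would trip the guard (only `completed_lines` and `log_path` are confirmed by this diff; the other field is assumed):

    # Illustration only: a run where every line failed.
    run_summary = {
        "completed_lines": 0,               # nothing succeeded -> raise
        "failed_lines": 10,                 # assumed field name
        "log_path": "/tmp/promptflow/run",  # surfaced in the error message
    }

Partial failures (some lines completed) still pass through; only a run with zero completed lines raises.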
@@ -498,7 +509,7 @@
     # Concatenate output to input
     target_output = pd.concat([target_output, initial_data], axis=1)
 
-    return target_output, generated_columns, run
+    return target_output, generated_columns, run.run.result()
 
 
 def _process_column_mappings(
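
Note the changed return value: `run.run.result()` blocks on the wrapped future, so callers such as `_evaluate` (which passes `target_run` onward) still receive a concrete promptflow `Run` object rather than the new `ProxyRun` wrapper, leaving downstream consumers unaffected.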
@@ -727,7 +738,7 @@ def _evaluate( # pylint: disable=too-many-locals,too-many-statements
     target_generated_columns: Set[str] = set()
     if data is not None and target is not None:
         input_data_df, target_generated_columns, target_run = _apply_target_to_data(
-            target, data, pf_client, input_data_df, evaluation_name, **kwargs
+            target, data, ProxyClient(pf_client), input_data_df, evaluation_name, **kwargs
         )
 
         for evaluator_name, mapping in column_mapping.items():

sdk/evaluation/azure-ai-evaluation/tests/conftest.py

Lines changed: 7 additions & 0 deletions
@@ -599,3 +599,10 @@ def stop_promptflow_service() -> None:
     stop_service()
 
 stop_promptflow_service()
+
+@pytest.fixture
+def run_from_temp_dir(tmp_path):
+    original_cwd = os.getcwd()
+    os.chdir(tmp_path)
+    yield
+    os.chdir(original_cwd)
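
The fixture is opt-in: any test that lists `run_from_temp_dir` as a parameter runs with its working directory switched to a fresh `tmp_path`, so artifacts that promptflow writes to the current directory land in a throwaway folder. For comparison, pytest's built-in `monkeypatch.chdir` would express the same idea with automatic cleanup (an alternative sketch, not what this commit uses):

    # Alternative sketch using pytest's built-in monkeypatch fixture.
    import pytest

    @pytest.fixture
    def run_from_temp_dir(tmp_path, monkeypatch):
        monkeypatch.chdir(tmp_path)  # restored automatically after the test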

sdk/evaluation/azure-ai-evaluation/tests/e2etests/test_evaluate.py

Lines changed: 3 additions & 4 deletions
@@ -34,7 +34,6 @@ def questions_file():
     data_path = os.path.join(pathlib.Path(__file__).parent.resolve(), "data")
     return os.path.join(data_path, "questions.jsonl")
 
-
 def answer_evaluator(response):
     return {"length": len(response)}
 
@@ -173,7 +172,7 @@ def test_evaluate_python_function(self, data_file, use_pf_client, function, colu
         assert metrics.get(metric) == list_mean_nan_safe(row_result_df[out_column])
         assert row_result_df[out_column][2] == 31
 
-    def test_evaluate_with_target(self, questions_file):
+    def test_evaluate_with_target(self, questions_file, run_from_temp_dir):
         """Test evaluation with target function."""
         # We cannot define target in this file as pytest will load
         # all modules in test folder and target_fn will be imported from the first
 
@@ -211,7 +210,7 @@ def test_evaluate_with_target(self, questions_file):
             {"default": {"column_mapping": {"another_question": "${target.query}"}}},
         ],
     )
-    def test_evaluate_another_questions(self, questions_file, evaluation_config):
+    def test_evaluate_another_questions(self, questions_file, evaluation_config, run_from_temp_dir):
         """Test evaluation with target function."""
         from .target_fn import target_fn3
 
@@ -270,7 +269,7 @@ def test_evaluate_another_questions(self, questions_file, evaluation_config):
             ),
         ],
     )
-    def test_evaluate_with_evaluator_config(self, questions_file, evaluate_config):
+    def test_evaluate_with_evaluator_config(self, questions_file, evaluate_config, run_from_temp_dir):
         input_data = pd.read_json(questions_file, lines=True)
         from .target_fn import target_fn2

sdk/evaluation/azure-ai-evaluation/tests/e2etests/test_mass_evaluate.py

Lines changed: 1 addition & 0 deletions
@@ -308,6 +308,7 @@ def test_evaluate_multimodal(
         multimodal_input_selector,
         azure_cred,
         project_scope,
+        run_from_temp_dir,
     ):
         # ContentSafetyMultimodalEvaluator is excluded due 2 reasons:
         # - It fails in playback mode for some reason

sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluate.py

Lines changed: 12 additions & 0 deletions
@@ -118,6 +118,8 @@ def _target_fn2(query):
     response["query"] = f"The query is as follows: {query}"
     return response
 
+def _target_that_fails(query):
+    raise Exception("I am failing")
 
 def _new_answer_target():
     return {"response": "new response"}
 
@@ -830,3 +832,13 @@ def test_malformed_file_inputs(self, model_config, missing_header_csv_file, miss
         )
 
         assert "Either 'conversation' or individual inputs must be provided." in str(exc_info.value)
+
+    def test_target_failure_error_message(self, questions_file):
+        with pytest.raises(EvaluationException) as exc_info:
+            evaluate(
+                data=questions_file,
+                evaluators={"f1_score": F1ScoreEvaluator()},
+                target=_target_that_fails,
+            )
+
+        assert "Evaluation target failed to produce any results. Please check the logs at " in str(exc_info.value)
