
Commit 634a210

Exception on target failure (#39350)
* Exception on target failure
* Fixing test by running in its own tmp directory
* Fixing some failing tests
* Fixing more tests
1 parent 93fd77c commit 634a210

File tree

6 files changed: +44 −13 lines changed


sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_batch_run/__init__.py

Lines changed: 2 additions & 1 deletion
@@ -5,5 +5,6 @@
 from .code_client import CodeClient
 from .proxy_client import ProxyClient
 from .target_run_context import TargetRunContext
+from .proxy_client import ProxyRun
 
-__all__ = ["CodeClient", "ProxyClient", "EvalRunContext", "TargetRunContext"]
+__all__ = ["CodeClient", "ProxyClient", "EvalRunContext", "TargetRunContext", "ProxyRun"]

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py

Lines changed: 19 additions & 8 deletions
@@ -27,7 +27,7 @@
 )
 from .._model_configurations import AzureAIProject, EvaluationResult, EvaluatorConfig
 from .._user_agent import USER_AGENT
-from ._batch_run import EvalRunContext, CodeClient, ProxyClient, TargetRunContext
+from ._batch_run import EvalRunContext, CodeClient, ProxyClient, TargetRunContext, ProxyRun
 from ._utils import (
     _apply_column_mapping,
     _log_metrics_and_instance_results,
@@ -448,7 +448,7 @@ def _validate_and_load_data(target, data, evaluators, output_path, azure_ai_proj
 def _apply_target_to_data(
     target: Callable,
     data: Union[str, os.PathLike],
-    pf_client: PFClient,
+    batch_client: TClient,
     initial_data: pd.DataFrame,
     evaluation_name: Optional[str] = None,
     **kwargs,
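
`TClient` is not defined in this hunk; judging from the calls made later in the function (`run`, `get_details`, `get_run_summary`), it plausibly names the interface shared by `ProxyClient` and `CodeClient`. A hypothetical protocol capturing what `_apply_target_to_data` actually requires (an illustration, not the real definition):

    # Assumed client interface; TClient's real definition is not in this diff.
    from typing import Any, Protocol
    import pandas as pd

    class BatchClient(Protocol):
        def run(self, flow: Any, display_name: str, data: Any, stream: bool, name: str) -> Any: ...
        def get_details(self, run: Any, all_results: bool = False) -> pd.DataFrame: ...
        def get_run_summary(self, run: Any) -> dict: ...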
@@ -460,8 +460,8 @@
     :type target: Callable
     :param data: The path to input jsonl or csv file.
     :type data: Union[str, os.PathLike]
-    :param pf_client: The promptflow client to be used.
-    :type pf_client: PFClient
+    :param batch_client: The promptflow client to be used.
+    :type batch_client: PFClient
     :param initial_data: The data frame with the loaded data.
     :type initial_data: pd.DataFrame
     :param evaluation_name: The name of the evaluation.
@@ -471,15 +471,26 @@
     """
     _run_name = kwargs.get("_run_name")
     with TargetRunContext():
-        run: Run = pf_client.run(
+        run: ProxyRun = batch_client.run(
             flow=target,
             display_name=evaluation_name,
             data=data,
             stream=True,
             name=_run_name,
         )
 
-    target_output: pd.DataFrame = pf_client.runs.get_details(run, all_results=True)
+    target_output: pd.DataFrame = batch_client.get_details(run, all_results=True)
+    run_summary = batch_client.get_run_summary(run)
+
+    if run_summary["completed_lines"] == 0:
+        msg = (f"Evaluation target failed to produce any results."
+               f" Please check the logs at {run_summary['log_path']} for more details about cause of failure.")
+        raise EvaluationException(
+            message=msg,
+            target=ErrorTarget.EVALUATE,
+            category=ErrorCategory.FAILED_EXECUTION,
+            blame=ErrorBlame.USER_ERROR,
+        )
     # Remove input and output prefix
     generated_columns = {
         col[len(Prefixes.OUTPUTS) :] for col in target_output.columns if col.startswith(Prefixes.OUTPUTS)
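
The new guard turns a silent, total target failure into an explicit error: if no input line completed, evaluation cannot proceed, so the user is pointed at the run logs instead of hitting a confusing downstream failure. A hypothetical summary shape that would trip the guard (only `completed_lines` and `log_path` are confirmed by this diff; the other field is assumed):

    # Illustration only: a run where every line failed.
    run_summary = {
        "completed_lines": 0,               # nothing succeeded -> raise
        "failed_lines": 10,                 # assumed field name
        "log_path": "/tmp/promptflow/run",  # surfaced in the error message
    }

Partial failures (some lines completed) still pass through; only a run with zero completed lines raises.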
@@ -498,7 +509,7 @@
     # Concatenate output to input
     target_output = pd.concat([target_output, initial_data], axis=1)
 
-    return target_output, generated_columns, run
+    return target_output, generated_columns, run.run.result()
 
 
 def _process_column_mappings(
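
Note the changed return value: `run.run.result()` blocks on the wrapped future, so callers such as `_evaluate` (which passes `target_run` onward) still receive a concrete promptflow `Run` object rather than the new `ProxyRun` wrapper, leaving downstream consumers unaffected.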
@@ -727,7 +738,7 @@ def _evaluate( # pylint: disable=too-many-locals,too-many-statements
     target_generated_columns: Set[str] = set()
     if data is not None and target is not None:
         input_data_df, target_generated_columns, target_run = _apply_target_to_data(
-            target, data, pf_client, input_data_df, evaluation_name, **kwargs
+            target, data, ProxyClient(pf_client), input_data_df, evaluation_name, **kwargs
         )
 
         for evaluator_name, mapping in column_mapping.items():

sdk/evaluation/azure-ai-evaluation/tests/conftest.py

Lines changed: 7 additions & 0 deletions
@@ -599,3 +599,10 @@ def stop_promptflow_service() -> None:
     stop_service()
 
 stop_promptflow_service()
+
+@pytest.fixture
+def run_from_temp_dir(tmp_path):
+    original_cwd = os.getcwd()
+    os.chdir(tmp_path)
+    yield
+    os.chdir(original_cwd)
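
The fixture is opt-in: any test that lists `run_from_temp_dir` as a parameter runs with its working directory switched to a fresh `tmp_path`, so artifacts that promptflow writes to the current directory land in a throwaway folder. For comparison, pytest's built-in `monkeypatch.chdir` would express the same idea with automatic cleanup (an alternative sketch, not what this commit uses):

    # Alternative sketch using pytest's built-in monkeypatch fixture.
    import pytest

    @pytest.fixture
    def run_from_temp_dir(tmp_path, monkeypatch):
        monkeypatch.chdir(tmp_path)  # restored automatically after the test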

sdk/evaluation/azure-ai-evaluation/tests/e2etests/test_evaluate.py

Lines changed: 3 additions & 4 deletions
@@ -34,7 +34,6 @@ def questions_file():
     data_path = os.path.join(pathlib.Path(__file__).parent.resolve(), "data")
     return os.path.join(data_path, "questions.jsonl")
 
-
 def answer_evaluator(response):
     return {"length": len(response)}
 
@@ -173,7 +172,7 @@ def test_evaluate_python_function(self, data_file, use_pf_client, function, colu
         assert metrics.get(metric) == list_mean_nan_safe(row_result_df[out_column])
         assert row_result_df[out_column][2] == 31
 
-    def test_evaluate_with_target(self, questions_file):
+    def test_evaluate_with_target(self, questions_file, run_from_temp_dir):
         """Test evaluation with target function."""
         # We cannot define target in this file as pytest will load
         # all modules in test folder and target_fn will be imported from the first
 
@@ -211,7 +210,7 @@ def test_evaluate_with_target(self, questions_file):
             {"default": {"column_mapping": {"another_question": "${target.query}"}}},
         ],
     )
-    def test_evaluate_another_questions(self, questions_file, evaluation_config):
+    def test_evaluate_another_questions(self, questions_file, evaluation_config, run_from_temp_dir):
         """Test evaluation with target function."""
         from .target_fn import target_fn3
 
@@ -270,7 +269,7 @@ def test_evaluate_another_questions(self, questions_file, evaluation_config):
             ),
         ],
     )
-    def test_evaluate_with_evaluator_config(self, questions_file, evaluate_config):
+    def test_evaluate_with_evaluator_config(self, questions_file, evaluate_config, run_from_temp_dir):
         input_data = pd.read_json(questions_file, lines=True)
         from .target_fn import target_fn2

sdk/evaluation/azure-ai-evaluation/tests/e2etests/test_mass_evaluate.py

Lines changed: 1 addition & 0 deletions
@@ -308,6 +308,7 @@ def test_evaluate_multimodal(
         multimodal_input_selector,
         azure_cred,
         project_scope,
+        run_from_temp_dir,
     ):
         # ContentSafetyMultimodalEvaluator is excluded due 2 reasons:
         # - It fails in playback mode for some reason

sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluate.py

Lines changed: 12 additions & 0 deletions
@@ -118,6 +118,8 @@ def _target_fn2(query):
     response["query"] = f"The query is as follows: {query}"
     return response
 
+def _target_that_fails(query):
+    raise Exception("I am failing")
 
 def _new_answer_target():
     return {"response": "new response"}
 
@@ -830,3 +832,13 @@ def test_malformed_file_inputs(self, model_config, missing_header_csv_file, miss
         )
 
         assert "Either 'conversation' or individual inputs must be provided." in str(exc_info.value)
+
+    def test_target_failure_error_message(self, questions_file):
+        with pytest.raises(EvaluationException) as exc_info:
+            evaluate(
+                data=questions_file,
+                evaluators={"f1_score": F1ScoreEvaluator()},
+                target=_target_that_fails,
+            )
+
+        assert "Evaluation target failed to produce any results. Please check the logs at " in str(exc_info.value)
