
Commit 861d148

nagkumar91 (Nagkumar Arkalgud) authored
Fill target errors with nans for evaluate (#41919)
* Prepare evals SDK release
* Fix bug
* Fix for ADV_CONV for FDP projects
* Update release date
* Re-add pyrit to matrix
* Change grader IDs
* Update unit test
* Replace all old grader IDs in tests
* Update platform-matrix.json: add pyrit without removing the existing entry
* Update test to ensure everything is mocked
* tox/black fixes
* Skip the test with known issues
* Update grader ID according to APIView feedback
* Update test
* Remove string check for grader ID
* Update changelog and officially start the freeze
* Update the enum according to suggestions
* Update the changelog
* Finalize logic
* Fill the dataset when the target doesn't respond with all columns
* Tox fixes
* Send dataframe instead of previous run
* Tox fixes
* Add a test
* More tox fixes
* Fix failing e2e test
* Update regex to solve the column mapping
* Re-add a validation step

---------

Co-authored-by: Nagkumar Arkalgud <nagkumar@naarkalg-work-mac.local>
Co-authored-by: Nagkumar Arkalgud <nagkumar@Mac.lan>
1 parent 526285d commit 861d148

File tree

4 files changed (+605, -43 lines)

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_batch_run/_run_submitter_client.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -44,7 +44,6 @@ def run(
         # input. Update the inputs so that each entry is a dictionary with a data key
         # that contains the original input data.
         inputs = [{"data": input_data} for input_data in data.to_dict(orient="records")]
-
         # Pass the correct previous run to the evaluator
         run: Optional[BatchClientRun] = kwargs.pop("run", None)
         if run:
@@ -73,6 +72,7 @@ def run(
         return run_future
 
     def get_details(self, client_run: BatchClientRun, all_results: bool = False) -> pd.DataFrame:
+
         run = self._get_run(client_run)
 
         data: Dict[str, List[Any]] = defaultdict(list)
```

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py

Lines changed: 109 additions & 35 deletions
```diff
@@ -7,6 +7,8 @@
 import logging
 import os
 import re
+import tempfile
+import json
 from typing import Any, Callable, Dict, List, Optional, Set, Tuple, TypedDict, Union, cast
 
 from openai import OpenAI, AzureOpenAI
```
```diff
@@ -611,13 +613,32 @@ def _apply_target_to_data(
             category=ErrorCategory.FAILED_EXECUTION,
             blame=ErrorBlame.USER_ERROR,
         )
+
+    # Log a warning if some rows failed
+    failed_lines = run_summary.get("failed_lines", 0)
+    completed_lines = run_summary["completed_lines"]
+    total_lines = failed_lines + completed_lines
+
+    if failed_lines > 0:
+        LOGGER.warning(
+            f"Target function completed {completed_lines} out of {total_lines} rows. "
+            f"{failed_lines} rows failed and will be filled with NaN values."
+        )
+
     # Remove input and output prefix
     generated_columns = {
         col[len(Prefixes.OUTPUTS) :] for col in target_output.columns if col.startswith(Prefixes.OUTPUTS)
     }
     # Sort output by line numbers
     target_output.set_index(f"inputs.{LINE_NUMBER}", inplace=True)
     target_output.sort_index(inplace=True)
+
+    initial_data_with_line_numbers = initial_data.copy()
+    initial_data_with_line_numbers[LINE_NUMBER] = range(len(initial_data))
+
+    complete_index = initial_data_with_line_numbers[LINE_NUMBER]
+    target_output = target_output.reindex(complete_index)
+
     target_output.reset_index(inplace=True, drop=False)
     # target_output contains only input columns, taken by function,
     # so we need to concatenate it to the input data frame.
@@ -626,8 +647,8 @@ def _apply_target_to_data(
     # Rename outputs columns to __outputs
     rename_dict = {col: col.replace(Prefixes.OUTPUTS, Prefixes.TSG_OUTPUTS) for col in target_output.columns}
     target_output.rename(columns=rename_dict, inplace=True)
-    # Concatenate output to input
-    target_output = pd.concat([target_output, initial_data], axis=1)
+    # Concatenate output to input - now both dataframes have the same number of rows
+    target_output = pd.concat([initial_data, target_output], axis=1)
 
     return target_output, generated_columns, run
 
```
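The reindex against the full set of input line numbers is what turns missing target rows into NaN rows instead of silently shrinking the frame before the concat. A minimal standalone pandas sketch of the same pattern (column names such as `inputs.line_number` are illustrative, not the SDK's internal constants):

```python
import pandas as pd

# Five input rows; the target only produced output for rows 0, 1 and 3.
initial_data = pd.DataFrame({"query": ["q0", "q1", "q2", "q3", "q4"]})
target_output = pd.DataFrame(
    {"inputs.line_number": [0, 1, 3], "response": ["r0", "r1", "r3"]}
)

# Index the target output by line number, then reindex against the full
# set of input row numbers: rows the target never answered become NaN.
target_output = target_output.set_index("inputs.line_number").sort_index()
complete_index = pd.RangeIndex(len(initial_data))
target_output = target_output.reindex(complete_index)

# Both frames now have five rows, so a row-for-row concat is safe.
combined = pd.concat([initial_data, target_output.reset_index(drop=True)], axis=1)
print(combined)
#   query response
# 0    q0       r0
# 1    q1       r1
# 2    q2      NaN
# 3    q3       r3
# 4    q4      NaN
```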
```diff
@@ -645,7 +666,7 @@ def _process_column_mappings(
 
     processed_config: Dict[str, Dict[str, str]] = {}
 
-    expected_references = re.compile(r"^\$\{(target|data)\.[a-zA-Z0-9_]+\}$")
+    expected_references = re.compile(r"^\$\{(target|data)\.([a-zA-Z0-9_]+(?:\.[a-zA-Z0-9_]+)*)\}$")
 
     if column_mapping:
         for evaluator, mapping_config in column_mapping.items():
```
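The widened pattern accepts dotted column references such as `${data.__outputs.query}` (used by the updated tests below), whereas the old pattern only allowed a single identifier after `target.` or `data.`. A quick standalone comparison of the two expressions:

```python
import re

old_pattern = re.compile(r"^\$\{(target|data)\.[a-zA-Z0-9_]+\}$")
new_pattern = re.compile(r"^\$\{(target|data)\.([a-zA-Z0-9_]+(?:\.[a-zA-Z0-9_]+)*)\}$")

references = [
    "${data.query}",            # single segment: accepted by both
    "${data.__outputs.query}",  # dotted segment: only the new pattern matches
    "${target.response}",       # single segment: accepted by both
]

for ref in references:
    print(ref, bool(old_pattern.match(ref)), bool(new_pattern.match(ref)))
# ${data.query} True True
# ${data.__outputs.query} False True
# ${target.response} True True
```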
```diff
@@ -1013,17 +1034,50 @@ def _preprocess_data(
             target, batch_run_data, batch_run_client, input_data_df, evaluation_name, **kwargs
         )
 
-        for evaluator_name, mapping in column_mapping.items():
-            mapped_to_values = set(mapping.values())
-            for col in target_generated_columns:
-                # If user defined mapping differently, do not change it.
-                # If it was mapped to target, we have already changed it
-                # in _process_column_mappings
-                run_output = f"${{run.outputs.{col}}}"
-                # We will add our mapping only if
-                # customer did not mapped target output.
-                if col not in mapping and run_output not in mapped_to_values:
-                    column_mapping[evaluator_name][col] = run_output  # pylint: disable=unnecessary-dict-index-lookup
+        # IMPORTANT FIX: For ProxyClient, create a temporary file with the complete dataframe
+        # This ensures that evaluators get all rows (including failed ones with NaN values)
+        if isinstance(batch_run_client, ProxyClient):
+            # Create a temporary JSONL file with the complete dataframe
+            temp_file = tempfile.NamedTemporaryFile(mode="w", suffix=".jsonl", delete=False)
+            try:
+                for _, row in input_data_df.iterrows():
+                    row_dict = row.to_dict()
+                    temp_file.write(json.dumps(row_dict) + "\n")
+                temp_file.close()
+                batch_run_data = temp_file.name
+
+                # Update column mappings to use data references instead of run outputs
+                for evaluator_name, mapping in column_mapping.items():
+                    mapped_to_values = set(mapping.values())
+                    for col in target_generated_columns:
+                        # Use data reference instead of run output to ensure we get all rows
+                        target_reference = f"${{data.{Prefixes.TSG_OUTPUTS}{col}}}"
+
+                        # We will add our mapping only if customer did not map target output.
+                        if col not in mapping and target_reference not in mapped_to_values:
+                            column_mapping[evaluator_name][col] = target_reference
+
+                # Don't pass the target_run since we're now using the complete dataframe
+                target_run = None
+
+            except Exception as e:
+                # Clean up the temp file if something goes wrong
+                if os.path.exists(temp_file.name):
+                    os.unlink(temp_file.name)
+                raise e
+        else:
+            # For DataFrame-based clients, update batch_run_data to use the updated input_data_df
+            batch_run_data = input_data_df
+
+            # Update column mappings for DataFrame clients
+            for evaluator_name, mapping in column_mapping.items():
+                mapped_to_values = set(mapping.values())
+                for col in target_generated_columns:
+                    target_reference = f"${{data.{Prefixes.TSG_OUTPUTS}{col}}}"
+
+                    # We will add our mapping only if customer did not map target output.
+                    if col not in mapping and target_reference not in mapped_to_values:
+                        column_mapping[evaluator_name][col] = target_reference
 
     # After we have generated all columns, we can check if we have everything we need for evaluators.
     _validate_columns_for_evaluators(input_data_df, evaluators, target, target_generated_columns, column_mapping)
```
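For the ProxyClient branch, the complete dataframe, including the NaN-filled rows, is written to a temporary JSONL file so evaluators read every row from `data` references rather than from the previous target run. A minimal sketch of that serialization step outside the SDK (the column names are illustrative):

```python
import json
import os
import tempfile

import pandas as pd

df = pd.DataFrame({"query": ["q0", "q1"], "__outputs.response": ["r0", None]})

# One JSON object per line; delete=False so the path outlives this block and
# can be handed to a batch client that expects a file on disk.
temp_file = tempfile.NamedTemporaryFile(mode="w", suffix=".jsonl", delete=False)
try:
    for _, row in df.iterrows():
        temp_file.write(json.dumps(row.to_dict()) + "\n")
finally:
    temp_file.close()

with open(temp_file.name) as fh:
    print(fh.read())
# {"query": "q0", "__outputs.response": "r0"}
# {"query": "q1", "__outputs.response": null}

os.unlink(temp_file.name)  # the caller stays responsible for cleanup
```

Note that a true float NaN would serialize as the non-standard `NaN` token with the default `json` settings; the sketch uses `None` only to keep the output strictly valid JSON.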
```diff
@@ -1062,30 +1116,50 @@ def _run_callable_evaluators(
     batch_run_data = validated_data["batch_run_data"]
     column_mapping = validated_data["column_mapping"]
     evaluators = validated_data["evaluators"]
-    with EvalRunContext(batch_run_client):
-        runs = {
-            evaluator_name: batch_run_client.run(
-                flow=evaluator,
-                data=batch_run_data,
-                run=target_run,
-                evaluator_name=evaluator_name,
-                column_mapping=column_mapping.get(evaluator_name, column_mapping.get("default", None)),
-                stream=True,
-                name=kwargs.get("_run_name"),
-            )
-            for evaluator_name, evaluator in evaluators.items()
-        }
 
-        # get_details needs to be called within EvalRunContext scope in order to have user agent populated
-        per_evaluator_results: Dict[str, __EvaluatorInfo] = {
-            evaluator_name: {
-                "result": batch_run_client.get_details(run, all_results=True),
-                "metrics": batch_run_client.get_metrics(run),
-                "run_summary": batch_run_client.get_run_summary(run),
+    # Clean up temporary file after evaluation if it was created
+    temp_file_to_cleanup = None
+    if (
+        isinstance(batch_run_client, ProxyClient)
+        and isinstance(batch_run_data, str)
+        and batch_run_data.endswith(".jsonl")
+    ):
+        # Check if it's a temporary file (contains temp directory path)
+        if tempfile.gettempdir() in batch_run_data:
+            temp_file_to_cleanup = batch_run_data
+
+    try:
+        with EvalRunContext(batch_run_client):
+            runs = {
+                evaluator_name: batch_run_client.run(
+                    flow=evaluator,
+                    data=batch_run_data,
+                    # Don't pass target_run when using complete dataframe
+                    run=target_run,
+                    evaluator_name=evaluator_name,
+                    column_mapping=column_mapping.get(evaluator_name, column_mapping.get("default", None)),
+                    stream=True,
+                    name=kwargs.get("_run_name"),
+                )
+                for evaluator_name, evaluator in evaluators.items()
             }
-            for evaluator_name, run in runs.items()
-        }
 
+            # get_details needs to be called within EvalRunContext scope in order to have user agent populated
+            per_evaluator_results: Dict[str, __EvaluatorInfo] = {
+                evaluator_name: {
+                    "result": batch_run_client.get_details(run, all_results=True),
+                    "metrics": batch_run_client.get_metrics(run),
+                    "run_summary": batch_run_client.get_run_summary(run),
+                }
+                for evaluator_name, run in runs.items()
+            }
+    finally:
+        # Clean up temporary file if it was created
+        if temp_file_to_cleanup and os.path.exists(temp_file_to_cleanup):
+            try:
+                os.unlink(temp_file_to_cleanup)
+            except Exception as e:
+                LOGGER.warning(f"Failed to clean up temporary file {temp_file_to_cleanup}: {e}")
     # Concatenate all results
     evaluators_result_df = pd.DataFrame()
     evaluators_metric = {}
```
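The cleanup added here only deletes files the SDK created itself: the path must be a string ending in `.jsonl` and located under the system temp directory, and deletion happens in `finally` so a failing evaluator run cannot leak the file. The same guard-and-cleanup shape in isolation (the function name and the placeholder body are hypothetical):

```python
import os
import tempfile


def run_with_temp_cleanup(batch_run_data: str) -> None:
    # Only treat the input as disposable if it looks like a temp JSONL file
    # that was created by this process.
    temp_file_to_cleanup = None
    if batch_run_data.endswith(".jsonl") and tempfile.gettempdir() in batch_run_data:
        temp_file_to_cleanup = batch_run_data

    try:
        pass  # ... run the evaluators against batch_run_data here ...
    finally:
        if temp_file_to_cleanup and os.path.exists(temp_file_to_cleanup):
            try:
                os.unlink(temp_file_to_cleanup)
            except OSError as exc:
                # Mirror the diff's behavior: warn and move on rather than fail.
                print(f"Failed to clean up temporary file {temp_file_to_cleanup}: {exc}")
```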

sdk/evaluation/azure-ai-evaluation/tests/e2etests/test_evaluate.py

Lines changed: 7 additions & 7 deletions
```diff
@@ -209,12 +209,12 @@ def test_evaluate_with_target(self, questions_file, run_from_temp_dir):
             None,
             {"default": {}},
             {"default": {}, "question_ev": {}},
-            {"default": {"column_mapping": {"query": "${target.query}"}}},
+            {"default": {"column_mapping": {"query": "${data.__outputs.query}"}}},
             {"default": {"column_mapping": {"query": "${data.query}"}}},
             {"default": {}, "question_ev": {"column_mapping": {"query": "${data.query}"}}},
-            {"default": {}, "question_ev": {"column_mapping": {"query": "${target.query}"}}},
-            {"default": {}, "question_ev": {"column_mapping": {"another_question": "${target.query}"}}},
-            {"default": {"column_mapping": {"another_question": "${target.query}"}}},
+            {"default": {}, "question_ev": {"column_mapping": {"query": "${data.__outputs.query}"}}},
+            {"default": {}, "question_ev": {"column_mapping": {"another_question": "${data.__outputs.query}"}}},
+            {"default": {"column_mapping": {"another_question": "${data.__outputs.query}"}}},
         ],
     )
     def test_evaluate_another_questions(self, questions_file, evaluation_config, run_from_temp_dir):
@@ -241,7 +241,7 @@ def test_evaluate_another_questions(self, questions_file, evaluation_config, run
         if evaluation_config:
             config = evaluation_config.get("question_ev", evaluation_config.get("default", None))
             mapping = config.get("column_mapping", config)
-            if mapping and ("another_question" in mapping or mapping["query"] == "${data.query}"):
+            if mapping and ("another_question" in mapping or mapping.get("query") == "${data.query}"):
                 query = "inputs.query"
         expected = list(row_result_df[query].str.len())
         assert expected == list(row_result_df["outputs.question_ev.length"])
@@ -259,7 +259,7 @@ def test_evaluate_another_questions(self, questions_file, evaluation_config, run
             },
             "answer": {
                 "column_mapping": {
-                    "response": "${target.response}",
+                    "response": "${data.__outputs.response}",
                 }
             },
         }
@@ -268,7 +268,7 @@ def test_evaluate_another_questions(self, questions_file, evaluation_config, run
         {
             "default": {
                 "column_mapping": {
-                    "response": "${target.response}",
+                    "response": "${data.__outputs.response}",
                     "ground_truth": "${data.ground_truth}",
                 }
             },
```
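The test changes replace `${target.<col>}` references with `${data.__outputs.<col>}`, since target outputs now travel inside the data itself under the `__outputs.` prefix. A toy resolver (a hypothetical helper, not the SDK's implementation) shows how such a reference lines up with a row read back from the temporary JSONL file:

```python
import re


def resolve_reference(reference: str, row: dict):
    """Resolve a '${data.<column>}' style reference against one data row.

    Hypothetical helper for illustration only.
    """
    match = re.match(r"^\$\{data\.(.+)\}$", reference)
    if not match:
        raise ValueError(f"Unsupported reference: {reference}")
    return row[match.group(1)]


row = {"query": "What is the capital of France?", "__outputs.response": "Paris"}
print(resolve_reference("${data.query}", row))               # What is the capital of France?
print(resolve_reference("${data.__outputs.response}", row))  # Paris
```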
