
Commit 6a7c9a3

Bugfix/paginate aoai results (#41839)

Authored by Nagkumar Arkalgud (nagkumar91) with Copilot
* Prepare evals SDK Release
* Fix bug
* Fix for ADV_CONV for FDP projects
* Update release date
* re-add pyrit to matrix
* Change grader ids
* Update unit test
* replace all old grader IDs in tests
* Update platform-matrix.json: add pyrit and do not remove the other one
* AOAI results pagination
* Bugfix: Get AOAI results paginated
* Update test to ensure everything is mocked
* tox/black fixes
* Fix test
* Skip that test with issues
* better variable name, explaining why limit is 100
* Update sdk/evaluation/azure-ai-evaluation/tests/unittests/test_aoai_score_model_grader.py (Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>)
* remove limit
* validated successful runs for aoai evaluators for merging multiple results without limit param
* Add tests

Co-authored-by: Nagkumar Arkalgud <nagkumar@naarkalg-work-mac.local>
Co-authored-by: Nagkumar Arkalgud <nagkumar@Mac.lan>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
1 parent: b8a77e8 · commit: 6a7c9a3

File tree

4 files changed: +396 −10 lines changed


sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate_aoai.py

Lines changed: 44 additions & 9 deletions
@@ -32,7 +32,7 @@ class OAIEvalRunCreationInfo(TypedDict, total=True):
 
 
 def _split_evaluators_and_grader_configs(
-    evaluators: Dict[str, Union[Callable, AzureOpenAIGrader]]
+    evaluators: Dict[str, Union[Callable, AzureOpenAIGrader]],
 ) -> Tuple[Dict[str, Callable], Dict[str, AzureOpenAIGrader]]:
     """
     Given a dictionary of strings to Evaluators and AOAI graders. Identity which is which, and return two
@@ -203,6 +203,7 @@ def _get_single_run_results(
     """
     # Wait for evaluation run to complete
     run_results = _wait_for_run_conclusion(run_info["client"], run_info["eval_group_id"], run_info["eval_run_id"])
+
     if run_results.status != "completed":
         raise EvaluationException(
             message=f"AOAI evaluation run {run_info['eval_group_id']}/{run_info['eval_run_id']}"
@@ -211,10 +212,7 @@ def _get_single_run_results(
             category=ErrorCategory.FAILED_EXECUTION,
             target=ErrorTarget.AOAI_GRADER,
         )
-    LOGGER.info(
-        f"AOAI: Evaluation run {run_info['eval_group_id']}/{run_info['eval_run_id']}"
-        + " completed successfully. Gathering results..."
-    )
+
     # Convert run results into a dictionary of metrics
     run_metrics = {}
     if run_results.per_testing_criteria_results is None:
@@ -247,13 +245,37 @@ def _get_single_run_results(
     # The passed and score values are then added to the results dictionary, prepended with the grader's name
     # as entered by the user in the inputted dictionary.
     # Other values, if they exist, are also added to the results dictionary.
-    raw_list_results = run_info["client"].evals.runs.output_items.list(
-        eval_id=run_info["eval_group_id"], run_id=run_info["eval_run_id"]
-    )
+
+    # Collect all results with pagination
+    all_results = []
+    next_cursor = None
+    limit = 100  # Max allowed by API
+
+    while True:
+        # Build kwargs for the API call
+        list_kwargs = {"eval_id": run_info["eval_group_id"], "run_id": run_info["eval_run_id"], "limit": limit}
+        if next_cursor is not None:
+            list_kwargs["after"] = next_cursor
+
+        raw_list_results = run_info["client"].evals.runs.output_items.list(**list_kwargs)
+
+        # Add current page results
+        all_results.extend(raw_list_results.data)
+
+        # Check for more pages
+        if hasattr(raw_list_results, "has_more") and raw_list_results.has_more:
+            if hasattr(raw_list_results, "data") and len(raw_list_results.data) > 0:
+                # Get the last item's ID for cursor-based pagination
+                next_cursor = raw_list_results.data[-1].id
+            else:
+                break
+        else:
+            break
+
     listed_results = {"index": []}
     # raw data has no order guarantees, we need to sort them by their
     # datasource_item_id
-    for row_result in raw_list_results.data:
+    for row_result in all_results:
         # Add the datasource_item_id for later sorting
         listed_results["index"].append(row_result.datasource_item_id)
         for single_grader_row_result in row_result.results:
@@ -273,6 +295,19 @@ def _get_single_run_results(
                 if formatted_column_name not in listed_results:
                     listed_results[formatted_column_name] = []
                 listed_results[formatted_column_name].append(value)
+
+    # Ensure all columns have the same length as the index
+    num_rows = len(listed_results["index"])
+    for col_name in list(listed_results.keys()):
+        if col_name != "index":
+            col_length = len(listed_results[col_name])
+            if col_length < num_rows:
+                # Pad with None values
+                listed_results[col_name].extend([None] * (num_rows - col_length))
+            elif col_length > num_rows:
+                # This shouldn't happen, but truncate if it does
+                listed_results[col_name] = listed_results[col_name][:num_rows]
+
     output_df = pd.DataFrame(listed_results)
     # sort by index
     output_df = output_df.sort_values("index", ascending=[True])

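For reference, the pagination loop introduced above can be read as a small standalone helper. The following is a minimal sketch, not part of the SDK: the helper name fetch_all_output_items is invented here, and it assumes an OpenAI-style client whose evals.runs.output_items.list(...) responses expose data, has_more, and per-item id fields, exactly as the diff above relies on.

    from typing import Any, List


    def fetch_all_output_items(client: Any, eval_group_id: str, eval_run_id: str, limit: int = 100) -> List[Any]:
        """Collect every output item of an eval run, following the `after` cursor page by page."""
        all_results: List[Any] = []
        next_cursor = None

        while True:
            kwargs = {"eval_id": eval_group_id, "run_id": eval_run_id, "limit": limit}
            if next_cursor is not None:
                kwargs["after"] = next_cursor  # resume after the last item already collected

            page = client.evals.runs.output_items.list(**kwargs)
            all_results.extend(page.data)

            # Keep paging only while the service reports more data and the page is non-empty.
            if getattr(page, "has_more", False) and page.data:
                next_cursor = page.data[-1].id
            else:
                return all_results

Capping limit at 100 mirrors the maximum page size the API accepts, which is why datasets larger than 100 rows previously lost results.
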
sdk/evaluation/azure-ai-evaluation/tests/e2etests/test_aoai_graders.py

Lines changed: 107 additions & 0 deletions
@@ -180,3 +180,110 @@ def target(query: str):
         assert len(metrics.keys()) == 2
         assert metrics["similarity.pass_rate"] == 1.0
         assert metrics["string_check.pass_rate"] == 0.3333333333333333
+
+    @pytest.mark.skipif(not is_live(), reason="AOAI recordings have bad recording scrubbing")
+    def test_evaluate_with_large_dataset_pagination(self, model_config):
+        """Test AOAI graders with a large dataset that requires pagination"""
+        # Create a large dataset that will trigger pagination (>100 rows)
+        large_data = []
+        for i in range(150):  # Create 150 rows to ensure pagination
+            large_data.append({"query": f"What is {i}?", "ground_truth": f"This is item {i}", "answer": f"Item {i}"})
+
+        # Create a temporary file with the large dataset
+        import tempfile
+        import json
+
+        with tempfile.NamedTemporaryFile(mode="w", suffix=".jsonl", delete=False) as f:
+            for item in large_data:
+                f.write(json.dumps(item) + "\n")
+            temp_file = f.name
+
+        try:
+            # Use a simple string check grader
+            string_grader = AzureOpenAIStringCheckGrader(
+                model_config=model_config,
+                input="{{item.query}}",
+                name="contains_what",
+                operation="like",
+                reference="What",
+            )
+
+            evaluators = {
+                "string_check": string_grader,
+            }
+
+            # Run evaluation with large dataset
+            result = evaluate(data=temp_file, evaluators=evaluators, _use_run_submitter_client=True)
+
+            row_result_df = pd.DataFrame(result["rows"])
+            metrics = result["metrics"]
+
+            # Verify all 150 rows were processed
+            assert len(row_result_df) == 150
+            assert len(row_result_df["outputs.string_check.passed"]) == 150
+            assert len(row_result_df["outputs.string_check.score"]) == 150
+
+            # Verify metrics
+            assert "string_check.pass_rate" in metrics
+            assert metrics["string_check.pass_rate"] == 1.0  # All should pass
+
+        finally:
+            # Clean up temp file
+            os.unlink(temp_file)
+
+    @pytest.mark.skipif(not is_live(), reason="AOAI recordings have bad recording scrubbing")
+    def test_evaluate_multiple_graders_with_pagination(self, model_config):
+        """Test multiple AOAI graders with pagination to ensure proper result mapping"""
+        # Create dataset with 120 rows
+        large_data = []
+        for i in range(120):
+            large_data.append({"query": f"Hello world {i}", "answer": f"Response {i}"})
+
+        import tempfile
+        import json
+
+        with tempfile.NamedTemporaryFile(mode="w", suffix=".jsonl", delete=False) as f:
+            for item in large_data:
+                f.write(json.dumps(item) + "\n")
+            temp_file = f.name
+
+        try:
+            # Create multiple graders
+            string_grader1 = AzureOpenAIStringCheckGrader(
+                model_config=model_config,
+                input="{{item.query}}",
+                name="contains_hello",
+                operation="like",
+                reference="Hello",
+            )
+
+            string_grader2 = AzureOpenAIStringCheckGrader(
+                model_config=model_config,
+                input="{{item.query}}",
+                name="contains_world",
+                operation="like",
+                reference="world",
+            )
+
+            evaluators = {
+                "hello_check": string_grader1,
+                "world_check": string_grader2,
+            }
+
+            # Run evaluation
+            result = evaluate(data=temp_file, evaluators=evaluators, _use_run_submitter_client=True)
+
+            row_result_df = pd.DataFrame(result["rows"])
+
+            # Verify all rows processed for both graders
+            assert len(row_result_df) == 120
+            assert len(row_result_df["outputs.hello_check.passed"]) == 120
+            assert len(row_result_df["outputs.world_check.passed"]) == 120
+
+            # Verify both graders have 100% pass rate
+            metrics = result["metrics"]
+            assert metrics["hello_check.pass_rate"] == 1.0
+            assert metrics["world_check.pass_rate"] == 1.0
+
+        finally:
+            os.unlink(temp_file)
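
These tests can assert a full-length outputs column for every grader because of the column-padding step added in _evaluate_aoai.py. The snippet below is an illustrative, standalone re-run of that step on made-up data (the sample values are invented, not taken from the commit):

    import pandas as pd

    # Toy result table: one grader column came back short by a row.
    listed_results = {
        "index": [2, 0, 1],                           # datasource_item_ids, arriving unordered
        "outputs.string_check.passed": [True, True],  # short column: one row missing
        "outputs.string_check.score": [1.0, 1.0, 0.0],
    }

    # Same normalization as in _get_single_run_results: pad short columns with
    # None (or truncate over-long ones) so every column matches the index length.
    num_rows = len(listed_results["index"])
    for col_name in list(listed_results.keys()):
        if col_name != "index":
            col_length = len(listed_results[col_name])
            if col_length < num_rows:
                listed_results[col_name].extend([None] * (num_rows - col_length))
            elif col_length > num_rows:
                listed_results[col_name] = listed_results[col_name][:num_rows]

    output_df = pd.DataFrame(listed_results).sort_values("index", ascending=[True])
    print(output_df)  # three rows, sorted by index; the missing value shows as None/NaN

Without the padding, pandas would raise on the mismatched column lengths instead of producing a row-aligned DataFrame.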
