
Commit 6a7c9a3

Bugfix/paginate aoai results (#41839)

Authored by Nagkumar Arkalgud (nagkumar91) with Copilot
* Prepare evals SDK Release
* Fix bug
* Fix for ADV_CONV for FDP projects
* Update release date
* re-add pyrit to matrix
* Change grader ids
* Update unit test
* replace all old grader IDs in tests
* Update platform-matrix.json: add pyrit and do not remove the other one
* AOAI results pagination
* Bugfix: Get AOAI results paginated
* Update test to ensure everything is mocked
* tox/black fixes
* Fix test
* Skip that test with issues
* better variable name, explaining why limit is 100
* Update sdk/evaluation/azure-ai-evaluation/tests/unittests/test_aoai_score_model_grader.py (Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>)
* remove limit
* validated successful runs for aoai evaluators for merging multiple results without limit param
* Add tests

Co-authored-by: Nagkumar Arkalgud <nagkumar@naarkalg-work-mac.local>
Co-authored-by: Nagkumar Arkalgud <nagkumar@Mac.lan>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
1 parent: b8a77e8 · commit: 6a7c9a3

File tree

4 files changed: +396 −10 lines changed


sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate_aoai.py

Lines changed: 44 additions & 9 deletions
@@ -32,7 +32,7 @@ class OAIEvalRunCreationInfo(TypedDict, total=True):
 
 
 def _split_evaluators_and_grader_configs(
-    evaluators: Dict[str, Union[Callable, AzureOpenAIGrader]]
+    evaluators: Dict[str, Union[Callable, AzureOpenAIGrader]],
 ) -> Tuple[Dict[str, Callable], Dict[str, AzureOpenAIGrader]]:
     """
     Given a dictionary of strings to Evaluators and AOAI graders. Identity which is which, and return two
@@ -203,6 +203,7 @@ def _get_single_run_results(
     """
     # Wait for evaluation run to complete
     run_results = _wait_for_run_conclusion(run_info["client"], run_info["eval_group_id"], run_info["eval_run_id"])
+
     if run_results.status != "completed":
         raise EvaluationException(
             message=f"AOAI evaluation run {run_info['eval_group_id']}/{run_info['eval_run_id']}"
@@ -211,10 +212,7 @@ def _get_single_run_results(
             category=ErrorCategory.FAILED_EXECUTION,
             target=ErrorTarget.AOAI_GRADER,
         )
-    LOGGER.info(
-        f"AOAI: Evaluation run {run_info['eval_group_id']}/{run_info['eval_run_id']}"
-        + " completed successfully. Gathering results..."
-    )
+
     # Convert run results into a dictionary of metrics
     run_metrics = {}
     if run_results.per_testing_criteria_results is None:
@@ -247,13 +245,37 @@ def _get_single_run_results(
     # The passed and score values are then added to the results dictionary, prepended with the grader's name
     # as entered by the user in the inputted dictionary.
     # Other values, if they exist, are also added to the results dictionary.
-    raw_list_results = run_info["client"].evals.runs.output_items.list(
-        eval_id=run_info["eval_group_id"], run_id=run_info["eval_run_id"]
-    )
+
+    # Collect all results with pagination
+    all_results = []
+    next_cursor = None
+    limit = 100  # Max allowed by API
+
+    while True:
+        # Build kwargs for the API call
+        list_kwargs = {"eval_id": run_info["eval_group_id"], "run_id": run_info["eval_run_id"], "limit": limit}
+        if next_cursor is not None:
+            list_kwargs["after"] = next_cursor
+
+        raw_list_results = run_info["client"].evals.runs.output_items.list(**list_kwargs)
+
+        # Add current page results
+        all_results.extend(raw_list_results.data)
+
+        # Check for more pages
+        if hasattr(raw_list_results, "has_more") and raw_list_results.has_more:
+            if hasattr(raw_list_results, "data") and len(raw_list_results.data) > 0:
+                # Get the last item's ID for cursor-based pagination
+                next_cursor = raw_list_results.data[-1].id
+            else:
+                break
+        else:
+            break
+
     listed_results = {"index": []}
     # raw data has no order guarantees, we need to sort them by their
     # datasource_item_id
-    for row_result in raw_list_results.data:
+    for row_result in all_results:
         # Add the datasource_item_id for later sorting
         listed_results["index"].append(row_result.datasource_item_id)
         for single_grader_row_result in row_result.results:
@@ -273,6 +295,19 @@ def _get_single_run_results(
                 if formatted_column_name not in listed_results:
                     listed_results[formatted_column_name] = []
                 listed_results[formatted_column_name].append(value)
+
+    # Ensure all columns have the same length as the index
+    num_rows = len(listed_results["index"])
+    for col_name in list(listed_results.keys()):
+        if col_name != "index":
+            col_length = len(listed_results[col_name])
+            if col_length < num_rows:
+                # Pad with None values
+                listed_results[col_name].extend([None] * (num_rows - col_length))
+            elif col_length > num_rows:
+                # This shouldn't happen, but truncate if it does
+                listed_results[col_name] = listed_results[col_name][:num_rows]
+
     output_df = pd.DataFrame(listed_results)
     # sort by index
     output_df = output_df.sort_values("index", ascending=[True])

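For reference, the pagination loop introduced above can be read as a small standalone helper. The following is a minimal sketch, not part of the SDK: the helper name fetch_all_output_items is invented here, and it assumes an OpenAI-style client whose evals.runs.output_items.list(...) responses expose data, has_more, and per-item id fields, exactly as the diff above relies on.

    from typing import Any, List


    def fetch_all_output_items(client: Any, eval_group_id: str, eval_run_id: str, limit: int = 100) -> List[Any]:
        """Collect every output item of an eval run, following the `after` cursor page by page."""
        all_results: List[Any] = []
        next_cursor = None

        while True:
            kwargs = {"eval_id": eval_group_id, "run_id": eval_run_id, "limit": limit}
            if next_cursor is not None:
                kwargs["after"] = next_cursor  # resume after the last item already collected

            page = client.evals.runs.output_items.list(**kwargs)
            all_results.extend(page.data)

            # Keep paging only while the service reports more data and the page is non-empty.
            if getattr(page, "has_more", False) and page.data:
                next_cursor = page.data[-1].id
            else:
                return all_results

Capping limit at 100 mirrors the maximum page size the API accepts, which is why datasets larger than 100 rows previously lost results.
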
sdk/evaluation/azure-ai-evaluation/tests/e2etests/test_aoai_graders.py

Lines changed: 107 additions & 0 deletions
@@ -180,3 +180,110 @@ def target(query: str):
         assert len(metrics.keys()) == 2
         assert metrics["similarity.pass_rate"] == 1.0
         assert metrics["string_check.pass_rate"] == 0.3333333333333333
+
+    @pytest.mark.skipif(not is_live(), reason="AOAI recordings have bad recording scrubbing")
+    def test_evaluate_with_large_dataset_pagination(self, model_config):
+        """Test AOAI graders with a large dataset that requires pagination"""
+        # Create a large dataset that will trigger pagination (>100 rows)
+        large_data = []
+        for i in range(150):  # Create 150 rows to ensure pagination
+            large_data.append({"query": f"What is {i}?", "ground_truth": f"This is item {i}", "answer": f"Item {i}"})
+
+        # Create a temporary file with the large dataset
+        import tempfile
+        import json
+
+        with tempfile.NamedTemporaryFile(mode="w", suffix=".jsonl", delete=False) as f:
+            for item in large_data:
+                f.write(json.dumps(item) + "\n")
+            temp_file = f.name
+
+        try:
+            # Use a simple string check grader
+            string_grader = AzureOpenAIStringCheckGrader(
+                model_config=model_config,
+                input="{{item.query}}",
+                name="contains_what",
+                operation="like",
+                reference="What",
+            )
+
+            evaluators = {
+                "string_check": string_grader,
+            }
+
+            # Run evaluation with large dataset
+            result = evaluate(data=temp_file, evaluators=evaluators, _use_run_submitter_client=True)
+
+            row_result_df = pd.DataFrame(result["rows"])
+            metrics = result["metrics"]
+
+            # Verify all 150 rows were processed
+            assert len(row_result_df) == 150
+            assert len(row_result_df["outputs.string_check.passed"]) == 150
+            assert len(row_result_df["outputs.string_check.score"]) == 150
+
+            # Verify metrics
+            assert "string_check.pass_rate" in metrics
+            assert metrics["string_check.pass_rate"] == 1.0  # All should pass
+
+        finally:
+            # Clean up temp file
+            os.unlink(temp_file)
+
+    @pytest.mark.skipif(not is_live(), reason="AOAI recordings have bad recording scrubbing")
+    def test_evaluate_multiple_graders_with_pagination(self, model_config):
+        """Test multiple AOAI graders with pagination to ensure proper result mapping"""
+        # Create dataset with 120 rows
+        large_data = []
+        for i in range(120):
+            large_data.append({"query": f"Hello world {i}", "answer": f"Response {i}"})
+
+        import tempfile
+        import json
+
+        with tempfile.NamedTemporaryFile(mode="w", suffix=".jsonl", delete=False) as f:
+            for item in large_data:
+                f.write(json.dumps(item) + "\n")
+            temp_file = f.name
+
+        try:
+            # Create multiple graders
+            string_grader1 = AzureOpenAIStringCheckGrader(
+                model_config=model_config,
+                input="{{item.query}}",
+                name="contains_hello",
+                operation="like",
+                reference="Hello",
+            )
+
+            string_grader2 = AzureOpenAIStringCheckGrader(
+                model_config=model_config,
+                input="{{item.query}}",
+                name="contains_world",
+                operation="like",
+                reference="world",
+            )
+
+            evaluators = {
+                "hello_check": string_grader1,
+                "world_check": string_grader2,
+            }
+
+            # Run evaluation
+            result = evaluate(data=temp_file, evaluators=evaluators, _use_run_submitter_client=True)
+
+            row_result_df = pd.DataFrame(result["rows"])
+
+            # Verify all rows processed for both graders
+            assert len(row_result_df) == 120
+            assert len(row_result_df["outputs.hello_check.passed"]) == 120
+            assert len(row_result_df["outputs.world_check.passed"]) == 120
+
+            # Verify both graders have 100% pass rate
+            metrics = result["metrics"]
+            assert metrics["hello_check.pass_rate"] == 1.0
+            assert metrics["world_check.pass_rate"] == 1.0
+
+        finally:
+            os.unlink(temp_file)
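
These tests can assert a full-length outputs column for every grader because of the column-padding step added in _evaluate_aoai.py. The snippet below is an illustrative, standalone re-run of that step on made-up data (the sample values are invented, not taken from the commit):

    import pandas as pd

    # Toy result table: one grader column came back short by a row.
    listed_results = {
        "index": [2, 0, 1],                           # datasource_item_ids, arriving unordered
        "outputs.string_check.passed": [True, True],  # short column: one row missing
        "outputs.string_check.score": [1.0, 1.0, 0.0],
    }

    # Same normalization as in _get_single_run_results: pad short columns with
    # None (or truncate over-long ones) so every column matches the index length.
    num_rows = len(listed_results["index"])
    for col_name in list(listed_results.keys()):
        if col_name != "index":
            col_length = len(listed_results[col_name])
            if col_length < num_rows:
                listed_results[col_name].extend([None] * (num_rows - col_length))
            elif col_length > num_rows:
                listed_results[col_name] = listed_results[col_name][:num_rows]

    output_df = pd.DataFrame(listed_results).sort_values("index", ascending=[True])
    print(output_df)  # three rows, sorted by index; the missing value shows as None/NaN

Without the padding, pandas would raise on the mismatched column lengths instead of producing a row-aligned DataFrame.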
