
Commit 67fc87d

Author: Salma Elshafey (committed)

Added field names and messages as constants

1 parent 8865240 · commit 67fc87d

4 files changed: +49 -40 lines changed


sdk/evaluation/azure-ai-evaluation/CHANGELOG.md

Lines changed: 1 addition & 1 deletion
@@ -8,7 +8,7 @@
 
 - Significant improvements to IntentResolution evaluator. New version has less variance, is nearly 2x faster and consumes fewer tokens.
 
-- Fixes and improvements to ToolCallAccuracy evaluator. New version has less variance and now works on all tool calls that happen in a turn at once. Previously, it worked on each tool call independently without having context on the other tool calls that happen in the same turn, and then aggregated the results to a number in the range [0-1]. The number range is now [1-5].
+- Fixes and improvements to ToolCallAccuracy evaluator. New version has less variance and now works on all tool calls that happen in a turn at once. Previously, it worked on each tool call independently without having context on the other tool calls that happen in the same turn, and then aggregated the results to a score in the range [0-1]. The score range is now [1-5].
 
 - Fixed MeteorScoreEvaluator and other threshold-based evaluators returning incorrect binary results due to integer conversion of decimal scores. Previously, decimal scores like 0.9375 were incorrectly converted to integers (0) before threshold comparison, causing them to fail even when above the threshold. [#41415](https://github.com/Azure/azure-sdk-for-python/issues/41415)
 
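The MeteorScoreEvaluator entry above comes down to when the score is cast to an integer. A minimal sketch of the behavior it describes, with illustrative variable and threshold values rather than the evaluator's actual code:

```python
# Illustration of the bug described in the changelog entry above.
# The variable names and the 0.5 threshold are illustrative only.
score = 0.9375    # decimal score produced by a threshold-based evaluator
threshold = 0.5

# Old behavior: truncating to int before the comparison turned 0.9375 into 0,
# so the result was reported as "fail" even though the score beat the threshold.
old_result = "pass" if int(score) >= threshold else "fail"   # -> "fail"

# Fixed behavior: compare the decimal score directly.
new_result = "pass" if score >= threshold else "fail"        # -> "pass"

print(old_result, new_result)  # fail pass
```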

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_tool_call_accuracy/_tool_call_accuracy.py

Lines changed: 18 additions & 9 deletions
@@ -71,6 +71,15 @@ class ToolCallAccuracyEvaluator(PromptyEvaluatorBase[Union[str, float]]):
     _MIN_TOOL_CALL_ACCURACY_SCORE = 1
     _DEFAULT_TOOL_CALL_ACCURACY_SCORE = 3
 
+    _NO_TOOL_CALLS_MESSAGE = "No tool calls found in response or provided tool_calls."
+    _NO_TOOL_DEFINITIONS_MESSAGE = "Tool definitions must be provided."
+    _TOOL_DEFINITIONS_MISSING_MESSAGE = "Tool definitions for all tool calls must be provided."
+    _INVALID_SCORE_MESSAGE = "Tool call accuracy score must be between 1 and 5."
+
+    _LLM_SCORE_KEY = "tool_calls_success_level"
+    _EXCESS_TOOL_CALLS_KEY = "excess_tool_calls"
+    _MISSING_TOOL_CALLS_KEY = "missing_tool_calls"
+
     id = "id"
     """Evaluator identifier, experimental and to be used only with evaluation in cloud."""
 
@@ -150,9 +159,9 @@ def _convert_kwargs_to_eval_input(self, **kwargs):
             tool_calls = parsed_tool_calls
 
         if not tool_calls:
-            return {"error_message": "No tool calls found in response or provided tool_calls."}
+            return {"error_message": self._NO_TOOL_CALLS_MESSAGE}
         if not tool_definitions or len(tool_definitions) == 0:
-            return {"error_message": "Tool definitions must be provided."}
+            return {"error_message": self._NO_TOOL_DEFINITIONS_MESSAGE}
 
         if not isinstance(tool_calls, list):
             tool_calls = [tool_calls]
@@ -162,9 +171,9 @@ def _convert_kwargs_to_eval_input(self, **kwargs):
         try:
             needed_tool_definitions = self._extract_needed_tool_definitions(tool_calls, tool_definitions)
         except EvaluationException as e:
-            return {"error_message": "Tool definitions for all tool calls must be provided."}
+            return {"error_message": self._TOOL_DEFINITIONS_MISSING_MESSAGE}
         if len(needed_tool_definitions) == 0:
-            return {"error_message": "Tool definitions for all tool calls must be provided."}
+            return {"error_message": self._TOOL_DEFINITIONS_MISSING_MESSAGE}
 
         return {
             "query": query,
@@ -188,7 +197,7 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t
         llm_output = await self._flow(timeout=self._LLM_CALL_TIMEOUT, **eval_input)
 
         if isinstance(llm_output, dict):
-            score = llm_output.get("tool_calls_success_level", None)
+            score = llm_output.get(self._LLM_SCORE_KEY, None)
             if not score or not check_score_is_valid(score, ToolCallAccuracyEvaluator._MIN_TOOL_CALL_ACCURACY_SCORE, ToolCallAccuracyEvaluator._MAX_TOOL_CALL_ACCURACY_SCORE):
                 raise EvaluationException(
                     message=f"Invalid score value: {score}. Expected a number in range [{ToolCallAccuracyEvaluator._MIN_TOOL_CALL_ACCURACY_SCORE}, {ToolCallAccuracyEvaluator._MAX_TOOL_CALL_ACCURACY_SCORE}].",
@@ -208,8 +217,8 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str]]: # t
                 f"{self._result_key}_reason": reason,
                 'applicable': True,
                 'per_tool_call_details': llm_output.get('additional_details', {}),
-                'excess_tool_calls': llm_output.get('excess_tool_calls', {}),
-                'missing_tool_calls': llm_output.get('missing_tool_calls', {}),
+                self._EXCESS_TOOL_CALLS_KEY: llm_output.get(self._EXCESS_TOOL_CALLS_KEY, {}),
+                self._MISSING_TOOL_CALLS_KEY: llm_output.get(self._MISSING_TOOL_CALLS_KEY, {}),
             }
             return response_dict

@@ -255,8 +264,8 @@ def _not_applicable_result(self, error_message):
             f"{self._result_key}_reason": error_message,
             "applicable": False,
             "per_tool_call_details": {},
-            "excess_tool_calls": {},
-            "missing_tool_calls": {},
+            self._EXCESS_TOOL_CALLS_KEY: {},
+            self._MISSING_TOOL_CALLS_KEY: {},
 
         }
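Taken together, the changes in this file move every result-dict key and user-facing message onto the class, so the evaluator and the tests below share one source of truth. A simplified sketch of that pattern with a toy class (the real evaluator has more fields and logic):

```python
# Toy illustration of the constants pattern introduced above. The constant values
# mirror the evaluator's, but the class itself is a simplified stand-in.
class ToyToolCallEvaluator:
    _NO_TOOL_CALLS_MESSAGE = "No tool calls found in response or provided tool_calls."
    _EXCESS_TOOL_CALLS_KEY = "excess_tool_calls"
    _MISSING_TOOL_CALLS_KEY = "missing_tool_calls"

    def _not_applicable_result(self, error_message: str) -> dict:
        # Keys come from the class constants rather than string literals,
        # so a renamed key or reworded message only changes in one place.
        return {
            "reason": error_message,
            "applicable": False,
            self._EXCESS_TOOL_CALLS_KEY: {},
            self._MISSING_TOOL_CALLS_KEY: {},
        }


result = ToyToolCallEvaluator()._not_applicable_result(ToyToolCallEvaluator._NO_TOOL_CALLS_MESSAGE)
# Tests can assert against the same constants instead of repeating the literals:
assert result[ToyToolCallEvaluator._EXCESS_TOOL_CALLS_KEY] == {}
assert ToyToolCallEvaluator._NO_TOOL_CALLS_MESSAGE in result["reason"]
```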

sdk/evaluation/azure-ai-evaluation/tests/unittests/test_agent_evaluators.py

Lines changed: 8 additions & 8 deletions
@@ -27,8 +27,8 @@ def test_tool_call_accuracy_evaluator_missing_inputs(self, mock_model_config):
             }]
         )
         assert not result["applicable"]
-        assert result["tool_call_accuracy"] == "not applicable"
-        assert "No tool calls found in response or provided tool_calls." in result["tool_call_accuracy_reason"]
+        assert result[ToolCallAccuracyEvaluator._RESULT_KEY] == ToolCallAccuracyEvaluator._NOT_APPLICABLE_RESULT
+        assert ToolCallAccuracyEvaluator._NO_TOOL_CALLS_MESSAGE in result[f"{ToolCallAccuracyEvaluator._RESULT_KEY}_reason"]
 
         # Test with missing tool_definitions
         result = tool_call_accuracy(
@@ -43,8 +43,8 @@ def test_tool_call_accuracy_evaluator_missing_inputs(self, mock_model_config):
             }]
         )
         assert not result["applicable"]
-        assert result["tool_call_accuracy"] == "not applicable"
-        assert "Tool definitions must be provided." in result["tool_call_accuracy_reason"]
+        assert result[ToolCallAccuracyEvaluator._RESULT_KEY] == ToolCallAccuracyEvaluator._NOT_APPLICABLE_RESULT
+        assert ToolCallAccuracyEvaluator._NO_TOOL_DEFINITIONS_MESSAGE in result[f"{ToolCallAccuracyEvaluator._RESULT_KEY}_reason"]
 
         # Test with response that has no tool calls
         result = tool_call_accuracy(
@@ -65,8 +65,8 @@ def test_tool_call_accuracy_evaluator_missing_inputs(self, mock_model_config):
             }]
         )
         assert not result["applicable"]
-        assert result["tool_call_accuracy"] == "not applicable"
-        assert "No tool calls found in response or provided tool_calls." in result["tool_call_accuracy_reason"]
+        assert result[ToolCallAccuracyEvaluator._RESULT_KEY] == ToolCallAccuracyEvaluator._NOT_APPLICABLE_RESULT
+        assert ToolCallAccuracyEvaluator._NO_TOOL_CALLS_MESSAGE in result[f"{ToolCallAccuracyEvaluator._RESULT_KEY}_reason"]
 
         # Test with tool call for which definition is not provided
         result = tool_call_accuracy(
@@ -91,5 +91,5 @@ def test_tool_call_accuracy_evaluator_missing_inputs(self, mock_model_config):
             }]
         )
         assert not result["applicable"]
-        assert result["tool_call_accuracy"] == "not applicable"
-        assert "Tool definitions for all tool calls must be provided." in result["tool_call_accuracy_reason"]
+        assert result[ToolCallAccuracyEvaluator._RESULT_KEY] == ToolCallAccuracyEvaluator._NOT_APPLICABLE_RESULT
+        assert ToolCallAccuracyEvaluator._TOOL_DEFINITIONS_MISSING_MESSAGE in result[f"{ToolCallAccuracyEvaluator._RESULT_KEY}_reason"]

sdk/evaluation/azure-ai-evaluation/tests/unittests/test_tool_call_accuracy_evaluator.py

Lines changed: 22 additions & 22 deletions
@@ -21,8 +21,8 @@ async def flow_side_effect(timeout, **kwargs):
             "chain_of_thought": "The tool calls were very correct that I returned a huge number!",
             "tool_calls_success_level": 25,
             "additional_details": {},
-            "excess_tool_calls": {},
-            "missing_tool_calls": {}
+            ToolCallAccuracyEvaluator._EXCESS_TOOL_CALLS_KEY: {},
+            ToolCallAccuracyEvaluator._MISSING_TOOL_CALLS_KEY: {}
         }
 
     score = 1  # Default score for "all bad"
@@ -39,8 +39,8 @@ async def flow_side_effect(timeout, **kwargs):
             "tool_calls_made_by_agent": total_calls,
             "correct_tool_calls_made_by_agent": good_calls
         },
-        "excess_tool_calls": {"total": 0},
-        "missing_tool_calls": {"total": 0}
+        ToolCallAccuracyEvaluator._EXCESS_TOOL_CALLS_KEY: {"total": 0},
+        ToolCallAccuracyEvaluator._MISSING_TOOL_CALLS_KEY: {"total": 0}
     }
 
 
@@ -102,8 +102,8 @@ def test_evaluate_tools_valid1(self, mock_model_config):
         assert f"{key}_reason" in result
         assert result[f"{key}_reason"] == "Evaluated 2 tool calls with 1 correct calls."
         assert "per_tool_call_details" in result
-        assert "excess_tool_calls" in result
-        assert "missing_tool_calls" in result
+        assert ToolCallAccuracyEvaluator._EXCESS_TOOL_CALLS_KEY in result
+        assert ToolCallAccuracyEvaluator._MISSING_TOOL_CALLS_KEY in result
         assert result["applicable"] is True
 
     def test_evaluate_tools_valid2(self, mock_model_config):
@@ -161,8 +161,8 @@ def test_evaluate_tools_valid2(self, mock_model_config):
         assert f"{key}_reason" in result
         assert result[f"{key}_reason"] == "Evaluated 2 tool calls with 0 correct calls."
         assert "per_tool_call_details" in result
-        assert "excess_tool_calls" in result
-        assert "missing_tool_calls" in result
+        assert ToolCallAccuracyEvaluator._EXCESS_TOOL_CALLS_KEY in result
+        assert ToolCallAccuracyEvaluator._MISSING_TOOL_CALLS_KEY in result
         assert result["applicable"] is True
 
     def test_evaluate_tools_valid3(self, mock_model_config):
@@ -220,8 +220,8 @@ def test_evaluate_tools_valid3(self, mock_model_config):
         assert f"{key}_reason" in result
         assert result[f"{key}_reason"] == "Evaluated 2 tool calls with 2 correct calls."
         assert "per_tool_call_details" in result
-        assert "excess_tool_calls" in result
-        assert "missing_tool_calls" in result
+        assert ToolCallAccuracyEvaluator._EXCESS_TOOL_CALLS_KEY in result
+        assert ToolCallAccuracyEvaluator._MISSING_TOOL_CALLS_KEY in result
         assert result["applicable"] is True
 
     def test_evaluate_tools_one_eval_fails(self, mock_model_config):
@@ -304,13 +304,13 @@ def test_evaluate_tools_some_not_applicable(self, mock_model_config):
 
         key = ToolCallAccuracyEvaluator._RESULT_KEY
         assert result is not None
-        assert result[key] == "not applicable"
+        assert result[key] == ToolCallAccuracyEvaluator._NOT_APPLICABLE_RESULT
         assert result[f"{key}_result"] == "pass"
         assert result[f"{key}_threshold"] == ToolCallAccuracyEvaluator._DEFAULT_TOOL_CALL_ACCURACY_SCORE
-        assert result[f"{key}_reason"] == "Tool definitions for all tool calls must be provided."
+        assert result[f"{key}_reason"] == ToolCallAccuracyEvaluator._TOOL_DEFINITIONS_MISSING_MESSAGE
         assert result["per_tool_call_details"] == {}
-        assert result["excess_tool_calls"] == {}
-        assert result["missing_tool_calls"] == {}
+        assert result[ToolCallAccuracyEvaluator._EXCESS_TOOL_CALLS_KEY] == {}
+        assert result[ToolCallAccuracyEvaluator._MISSING_TOOL_CALLS_KEY] == {}
         assert result["applicable"] is False
 
     def test_evaluate_tools_all_not_applicable(self, mock_model_config):
@@ -344,13 +344,13 @@ def test_evaluate_tools_all_not_applicable(self, mock_model_config):
 
         key = ToolCallAccuracyEvaluator._RESULT_KEY
         assert result is not None
-        assert result[key] == "not applicable"
+        assert result[key] == ToolCallAccuracyEvaluator._NOT_APPLICABLE_RESULT
         assert result[f"{key}_result"] == "pass"
         assert result[f"{key}_threshold"] == ToolCallAccuracyEvaluator._DEFAULT_TOOL_CALL_ACCURACY_SCORE
-        assert result[f"{key}_reason"] == "Tool definitions for all tool calls must be provided."
+        assert result[f"{key}_reason"] == ToolCallAccuracyEvaluator._TOOL_DEFINITIONS_MISSING_MESSAGE
         assert result["per_tool_call_details"] == {}
-        assert result["excess_tool_calls"] == {}
-        assert result["missing_tool_calls"] == {}
+        assert result[ToolCallAccuracyEvaluator._EXCESS_TOOL_CALLS_KEY] == {}
+        assert result[ToolCallAccuracyEvaluator._MISSING_TOOL_CALLS_KEY] == {}
         assert result["applicable"] is False
 
     def test_evaluate_tools_no_tools(self, mock_model_config):
@@ -377,11 +377,11 @@ def test_evaluate_tools_no_tools(self, mock_model_config):
 
         key = ToolCallAccuracyEvaluator._RESULT_KEY
         assert result is not None
-        assert result[key] == "not applicable"
+        assert result[key] == ToolCallAccuracyEvaluator._NOT_APPLICABLE_RESULT
         assert result[f"{key}_result"] == "pass"
         assert result[f"{key}_threshold"] == ToolCallAccuracyEvaluator._DEFAULT_TOOL_CALL_ACCURACY_SCORE
-        assert result[f"{key}_reason"] == "No tool calls found in response or provided tool_calls."
+        assert result[f"{key}_reason"] == ToolCallAccuracyEvaluator._NO_TOOL_CALLS_MESSAGE
        assert result["per_tool_call_details"] == {}
-        assert result["excess_tool_calls"] == {}
-        assert result["missing_tool_calls"] == {}
+        assert result[ToolCallAccuracyEvaluator._EXCESS_TOOL_CALLS_KEY] == {}
+        assert result[ToolCallAccuracyEvaluator._MISSING_TOOL_CALLS_KEY] == {}
         assert result["applicable"] is False
